Preface
These are programs I wrote before the Spring Festival to help a QQ friend finish a thesis report. They still need polishing.
All of the programs used for the report are written in Python. Four programs were written to cover the required tasks:
- article_base_info.py scrapes each article's basic information: title, link, author, and publication date
- article_content_gevent.py scrapes the article content
- text_category.py classifies the articles
- format_data.py formats the training data
The program code follows.
# coding: utf-8
'''
Program: article_base_info.py
1. Scrapes the basic information (title, link, author, publication date) of
   Sina blog articles from the given range of roll pages.
2. Saves the data to MongoDB.
'''
import re
import concurrent.futures

import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient


def fetch(url):
    # Download one roll page; the page is GBK-encoded.
    res = requests.get(url)
    res.encoding = 'gbk'
    content = bs(res.text, 'lxml')
    return content


def base_info(html):
    # Extract article links, titles, authors and dates from one roll page.
    pattern = re.compile(r'http://blog.sina.com.cn/s/blog_.*\.html')
    links = re.findall(pattern, str(html))
    date_ = re.findall(r'\((\d{2,}.*)\)', str(html))
    tle_auth = html.select('li')
    authes = (auth.text.split(' ')[0] for auth in tle_auth)
    titles = (title.text.split(' ')[-1] for title in tle_auth)
    for infos in zip(links, titles, authes, date_):
        yield infos


def save(url):
    # Fetch one page and insert the records that are not yet in MongoDB.
    html = fetch(url)
    data = base_info(html)
    client = MongoClient('localhost', 27017)
    db = client.infos
    coll = db.coll
    for d in data:
        datum = {
            'links': d[0],
            'title': d[1],
            'auther': d[2],
            'date': d[3]
        }
        # Only insert links that have not been stored before.
        if coll.count_documents({'links': d[0]}) == 0:
            coll.insert_one(datum)
    print('{} is grabbed'.format(url))


if __name__ == '__main__':
    url = 'http://roll.blog.sina.com.cn/list/other/index_{}.shtml'
    start = input('Enter the start page (default 1) >> ')
    start = int(start) if start else 1
    end = input('Enter the end page (default 100) >> ')
    end = int(end) if end else 100
    pages = range(start, end + 1)
    urls = [url.format(page) for page in pages]
    with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
        executor.map(save, urls)
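The next program reads its links from a CSV file, while article_base_info.py stores its results in MongoDB, so an export step sits between the two. Below is a minimal sketch of such an export; the output name blog_data.csv and the column order are my assumptions, and the actual file name, 筛选后所有博客数据.csv, suggests some manual filtering was applied as well (Python 3 assumed).

# export_to_csv.py (sketch, not one of the four programs above)
# Dump the records collected by article_base_info.py into a CSV whose first
# column is the article link, as expected by article_content_gevent.py.
import csv
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
coll = client.infos.coll

with open('blog_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for doc in coll.find():
        writer.writerow([doc['links'], doc['title'], doc['auther'], doc['date']])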
# -*- coding: utf-8 -*-
'''
Program: article_content_gevent.py
1. Scrapes the content of Sina blog articles.
2. The article links are read from 筛选后所有博客数据.csv, which is derived from
   the data collected by article_base_info.py.
3. Some articles have been deleted by their authors (or removed for other
   reasons), so a link may still exist while the article itself does not.
'''
from gevent import monkey
monkey.patch_all()  # patch blocking I/O early so requests cooperates with gevent

import os
import csv
import logging

import gevent
import requests
from bs4 import BeautifulSoup as bs


def fetch(url):
    # Download one article page.
    res = requests.get(url)
    res.encoding = 'utf-8'
    content = bs(res.text, 'lxml')
    if not content:
        logging.warning('The blog has been deleted!')
    return content


def content_get(html):
    # Extract the article body; deleted pages have no such element.
    try:
        artical = html.select('#sina_keyword_ad_area2')[0].text.strip()
    except IndexError as e:
        print(e)
        logging.warning('the page is None')
        artical = ' '
    return artical


def links_get(filename):
    # Read the article links from the first column of the CSV file.
    with open(filename, 'r') as csvfile:
        logging.info('read the file {}'.format(filename))
        reader = csv.reader(csvfile)
        urls = [row[0] for row in reader]
    return urls


def download(url):
    # Fetch one article and save its text, named after the id in the URL.
    html = fetch(url)
    artical = content_get(html)
    with open('/home/mouse/Documents/artical/{}.txt'
              .format(url[-12:-5]), 'w') as f:
        f.write(artical)
    logging.info('writing the {}'.format(url))


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(message)s',
                        level=logging.WARNING)
    filename = '/home/mouse/我的坚果云/董姐的论文所需/筛选后所有博客数据.csv'
    urls = links_get(filename)
    if not os.path.isdir('/home/mouse/Documents/artical/'):
        os.makedirs('/home/mouse/Documents/artical/')
    threads = [gevent.spawn(download, url) for url in urls]
    gevent.joinall(threads)
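Spawning one greenlet per URL is fine for a moderate number of links, but with thousands of links it fires all requests at once. A gevent.pool.Pool caps the number of concurrent downloads. The following is a self-contained sketch only: the pool size of 20 is arbitrary, the URL list is made up, and download() here is a stand-in for the real one defined in article_content_gevent.py.

# concurrency_sketch.py (illustration only)
from gevent import monkey
monkey.patch_all()  # make blocking I/O cooperative before anything else

from gevent.pool import Pool


def download(url):
    # stand-in for the download() defined in article_content_gevent.py
    print('would fetch {}'.format(url))


urls = ['http://blog.sina.com.cn/s/blog_{}.html'.format(i) for i in range(100)]  # hypothetical links
pool = Pool(20)  # at most 20 downloads run at the same time
pool.map(download, urls)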
# coding=utf-8
'''
Program: text_category.py
1. Automatically classifies the articles scraped from Sina blogs.
2. The classifier comes from https://github.com/2shou/TextGrocery
3. Workflow: read the scraped article titles -> classify them -> write the
   titles back out together with their categories.
'''
import os
import csv

from tgrocery import Grocery
from train_txt import train_src


def category(title_lst, cates=None):
    # Classify each title with the trained model (new_grocery, defined below).
    if cates is None:
        cates = []
    for title in title_lst:
        cate = new_grocery.predict(title)
        cates.append(cate.predicted_y)
    return cates


def get_artical_title(filename):
    # Read the article titles from the second column of the CSV file.
    with open(filename, 'r') as f1:
        f1_csv = csv.reader(f1)
        title_lst = [row[1] for row in f1_csv]
    return title_lst


def write_cated_info(filename, new_filename):
    # Append the predicted category to each row and write it to a new file.
    titles = get_artical_title(filename)
    categ = category(titles)
    with open(filename, 'r') as read_file:
        reader = csv.reader(read_file)
        for i, row in enumerate(reader):
            row.append(categ[i])
            with open(new_filename, 'a+') as write_file:
                writer = csv.writer(write_file)
                writer.writerow(row)
            print('writing the {} item'.format(i))
    print('Done....................')


if __name__ == "__main__":
    # filename and new_filename are the input and output file paths.
    # Changing them lets the same script classify other data sets,
    # as long as the files follow the same format.
    filename = '/home/mouse/我的坚果云/董姐的论文所需/female7.csv'
    new_filename = '/home/mouse/我的坚果云/董姐的论文所需/female7_2.csv'
    if os.path.isfile(new_filename):
        os.remove(new_filename)
    grocery = Grocery('sample')
    grocery.train(train_src)
    grocery.save()
    new_grocery = Grocery('sample')
    new_grocery.load()
    write_cated_info(filename, new_filename)
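For reference, the TextGrocery calls used above (train, save, load, predict, predicted_y) fit together as in the following self-contained sketch; the two training samples and the test title are made up purely for illustration (TextGrocery targets Python 2).

# -*- coding: utf-8 -*-
# tgrocery_demo.py (illustration only)
from tgrocery import Grocery

train_src = [
    (u'环境污染', u'雾霾天气持续 多地空气质量堪忧'),
    (u'民生与福利', u'养老金上调方案正式公布'),
]
grocery = Grocery('demo')
grocery.train(train_src)   # train_src is a list of (label, text) pairs
grocery.save()

new_grocery = Grocery('demo')
new_grocery.load()
print(new_grocery.predict(u'空气污染治理需要多久').predicted_y)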
# -*- coding: utf-8 -*-
'''
Program: format_data.py
A helper program that formats 标题整理数据.xlsx (manually converted to CSV)
into the (label, text) training samples used by text_category.py.
'''
import csv
from collections import namedtuple

cate = ['社会冲突和问题', '毛泽东思想与政策', '政党与政府设置', '民主与法治', '民族和国际关系',
        '媒体与言论自由', '资本主义与市场经济', '全球化和对外开放', '民生与福利',
        '家庭冲突与伦理', '传统文化', '性与个人自由', '环境污染', '生态保护']

Category = namedtuple(
    'Category', 'social mao govm demcy nation media capi glob live home tran sex env eco')

filename = '/home/mouse/我的坚果云/董姐的论文所需/标题整理数据2.csv'


def train_text(filename, train_src=None):
    # Build a flat list of (label, text) pairs, one pair per category column
    # of every CSV row, in the format expected by Grocery.train().
    if train_src is None:
        train_src = []

    def format_cate():
        # Map each CSV row onto the Category namedtuple and pair every field
        # with its category label.
        for emp in map(Category._make, csv.reader(open(filename, 'r'))):
            social = (cate[0], emp.social)
            mao = (cate[1], emp.mao)
            govm = (cate[2], emp.govm)
            demcy = (cate[3], emp.demcy)
            nation = (cate[4], emp.nation)
            media = (cate[5], emp.media)
            capi = (cate[6], emp.capi)
            glob = (cate[7], emp.glob)
            live = (cate[8], emp.live)
            home = (cate[9], emp.home)
            tran = (cate[10], emp.tran)
            sex = (cate[11], emp.sex)
            env = (cate[12], emp.env)
            eco = (cate[13], emp.eco)
            yield social, mao, govm, demcy, nation, media, capi, glob, \
                live, home, tran, sex, env, eco

    for cat in format_cate():
        train_src.extend(list(cat))
    return train_src
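text_category.py imports train_src from a module named train_txt that is not listed above. Presumably it just wires the output of train_text() into a module-level variable; the following is only a guess at its contents, assuming it reuses format_data.py and the CSV path hard-coded there.

# train_txt.py (sketch of the module imported by text_category.py)
from format_data import train_text, filename

# train_src becomes the flat list of (label, text) pairs passed to Grocery.train()
train_src = train_text(filename)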
All of the programs above were written by me and ran successfully on my own machine, but they have not been tested on other computers or platforms. Because of dependency and compatibility issues, and the limits of my own skill, I cannot guarantee that they will run correctly for everyone.