Preface
These are programs I wrote before the Spring Festival to help a QQ friend finish a thesis report. They still need polishing.
All of the programs used for the report are written in Python. Four programs were written to cover the required tasks:
- article_base_info.py scrapes each article's basic information: title, link, author, and publication date
- article_content_gevent.py scrapes the article content
- text_category.py classifies the articles
- format_data.py formats the training data
The program code follows.
# coding: utf-8
'''
Program: article_base_info.py
1. Scrapes the basic information (title, link, author, publication date) of
   Sina blog articles from the given range of roll pages.
2. Saves the data to MongoDB.
'''
import re
import concurrent.futures

import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient


def fetch(url):
    # Download one roll page; the page is GBK-encoded.
    res = requests.get(url)
    res.encoding = 'gbk'
    content = bs(res.text, 'lxml')
    return content


def base_info(html):
    # Extract article links, titles, authors and dates from one roll page.
    pattern = re.compile(r'http://blog.sina.com.cn/s/blog_.*\.html')
    links = re.findall(pattern, str(html))
    date_ = re.findall(r'\((\d{2,}.*)\)', str(html))
    tle_auth = html.select('li')
    authes = (auth.text.split(' ')[0] for auth in tle_auth)
    titles = (title.text.split(' ')[-1] for title in tle_auth)
    for infos in zip(links, titles, authes, date_):
        yield infos


def save(url):
    # Fetch one page and insert the records that are not yet in MongoDB.
    html = fetch(url)
    data = base_info(html)
    client = MongoClient('localhost', 27017)
    db = client.infos
    coll = db.coll
    for d in data:
        datum = {
            'links': d[0],
            'title': d[1],
            'auther': d[2],
            'date': d[3]
        }
        # Only insert links that have not been stored before.
        if coll.count_documents({'links': d[0]}) == 0:
            coll.insert_one(datum)
    print('{} is grabbed'.format(url))


if __name__ == '__main__':
    url = 'http://roll.blog.sina.com.cn/list/other/index_{}.shtml'
    start = input('Enter the start page (default 1) >> ')
    start = int(start) if start else 1
    end = input('Enter the end page (default 100) >> ')
    end = int(end) if end else 100
    pages = range(start, end + 1)
    urls = [url.format(page) for page in pages]
    with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
        executor.map(save, urls)
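The next program reads its links from a CSV file, while article_base_info.py stores its results in MongoDB, so an export step sits between the two. Below is a minimal sketch of such an export; the output name blog_data.csv and the column order are my assumptions, and the actual file name, 筛选后所有博客数据.csv, suggests some manual filtering was applied as well (Python 3 assumed).

# export_to_csv.py (sketch, not one of the four programs above)
# Dump the records collected by article_base_info.py into a CSV whose first
# column is the article link, as expected by article_content_gevent.py.
import csv
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
coll = client.infos.coll

with open('blog_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for doc in coll.find():
        writer.writerow([doc['links'], doc['title'], doc['auther'], doc['date']])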
# -*- coding: utf-8 -*-
'''
Program: article_content_gevent.py
1. Scrapes the content of Sina blog articles.
2. The article links are read from 筛选后所有博客数据.csv, which is derived from
   the data collected by article_base_info.py.
3. Some articles have been deleted by their authors (or removed for other
   reasons), so a link may still exist while the article itself does not.
'''
from gevent import monkey
monkey.patch_all()  # patch blocking I/O early so requests cooperates with gevent

import os
import csv
import logging

import gevent
import requests
from bs4 import BeautifulSoup as bs


def fetch(url):
    # Download one article page.
    res = requests.get(url)
    res.encoding = 'utf-8'
    content = bs(res.text, 'lxml')
    if not content:
        logging.warning('The blog has been deleted!')
    return content


def content_get(html):
    # Extract the article body; deleted pages have no such element.
    try:
        artical = html.select('#sina_keyword_ad_area2')[0].text.strip()
    except IndexError as e:
        print(e)
        logging.warning('the page is None')
        artical = ' '
    return artical


def links_get(filename):
    # Read the article links from the first column of the CSV file.
    with open(filename, 'r') as csvfile:
        logging.info('read the file {}'.format(filename))
        reader = csv.reader(csvfile)
        urls = [row[0] for row in reader]
    return urls


def download(url):
    # Fetch one article and save its text, named after the id in the URL.
    html = fetch(url)
    artical = content_get(html)
    with open('/home/mouse/Documents/artical/{}.txt'
              .format(url[-12:-5]), 'w') as f:
        f.write(artical)
    logging.info('writing the {}'.format(url))


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(message)s',
                        level=logging.WARNING)
    filename = '/home/mouse/我的坚果云/董姐的论文所需/筛选后所有博客数据.csv'
    urls = links_get(filename)
    if not os.path.isdir('/home/mouse/Documents/artical/'):
        os.makedirs('/home/mouse/Documents/artical/')
    threads = [gevent.spawn(download, url) for url in urls]
    gevent.joinall(threads)
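Spawning one greenlet per URL is fine for a moderate number of links, but with thousands of links it fires all requests at once. A gevent.pool.Pool caps the number of concurrent downloads. The following is a self-contained sketch only: the pool size of 20 is arbitrary, the URL list is made up, and download() here is a stand-in for the real one defined in article_content_gevent.py.

# concurrency_sketch.py (illustration only)
from gevent import monkey
monkey.patch_all()  # make blocking I/O cooperative before anything else

from gevent.pool import Pool


def download(url):
    # stand-in for the download() defined in article_content_gevent.py
    print('would fetch {}'.format(url))


urls = ['http://blog.sina.com.cn/s/blog_{}.html'.format(i) for i in range(100)]  # hypothetical links
pool = Pool(20)  # at most 20 downloads run at the same time
pool.map(download, urls)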
# coding=utf-8
'''
Program: text_category.py
1. Automatically classifies the articles scraped from Sina blogs.
2. The classifier comes from https://github.com/2shou/TextGrocery
3. Workflow: read the scraped article titles -> classify them -> write the
   titles back out together with their categories.
'''
import os
import csv

from tgrocery import Grocery
from train_txt import train_src


def category(title_lst, cates=None):
    # Classify each title with the trained model (new_grocery, defined below).
    if cates is None:
        cates = []
    for title in title_lst:
        cate = new_grocery.predict(title)
        cates.append(cate.predicted_y)
    return cates


def get_artical_title(filename):
    # Read the article titles from the second column of the CSV file.
    with open(filename, 'r') as f1:
        f1_csv = csv.reader(f1)
        title_lst = [row[1] for row in f1_csv]
    return title_lst


def write_cated_info(filename, new_filename):
    # Append the predicted category to each row and write it to a new file.
    titles = get_artical_title(filename)
    categ = category(titles)
    with open(filename, 'r') as read_file:
        reader = csv.reader(read_file)
        for i, row in enumerate(reader):
            row.append(categ[i])
            with open(new_filename, 'a+') as write_file:
                writer = csv.writer(write_file)
                writer.writerow(row)
            print('writing the {} item'.format(i))
    print('Done....................')


if __name__ == "__main__":
    # filename and new_filename are the input and output file paths.
    # Changing them lets the same script classify other data sets,
    # as long as the files follow the same format.
    filename = '/home/mouse/我的坚果云/董姐的论文所需/female7.csv'
    new_filename = '/home/mouse/我的坚果云/董姐的论文所需/female7_2.csv'
    if os.path.isfile(new_filename):
        os.remove(new_filename)
    grocery = Grocery('sample')
    grocery.train(train_src)
    grocery.save()
    new_grocery = Grocery('sample')
    new_grocery.load()
    write_cated_info(filename, new_filename)
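For reference, the TextGrocery calls used above (train, save, load, predict, predicted_y) fit together as in the following self-contained sketch; the two training samples and the test title are made up purely for illustration (TextGrocery targets Python 2).

# -*- coding: utf-8 -*-
# tgrocery_demo.py (illustration only)
from tgrocery import Grocery

train_src = [
    (u'环境污染', u'雾霾天气持续 多地空气质量堪忧'),
    (u'民生与福利', u'养老金上调方案正式公布'),
]
grocery = Grocery('demo')
grocery.train(train_src)   # train_src is a list of (label, text) pairs
grocery.save()

new_grocery = Grocery('demo')
new_grocery.load()
print(new_grocery.predict(u'空气污染治理需要多久').predicted_y)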
# -*- coding: utf-8 -*-
'''
Program: format_data.py
A helper program that formats 标题整理数据.xlsx (manually converted to CSV)
into the (label, text) training samples used by text_category.py.
'''
import csv
from collections import namedtuple

cate = ['社会冲突和问题', '毛泽东思想与政策', '政党与政府设置', '民主与法治', '民族和国际关系',
        '媒体与言论自由', '资本主义与市场经济', '全球化和对外开放', '民生与福利',
        '家庭冲突与伦理', '传统文化', '性与个人自由', '环境污染', '生态保护']

Category = namedtuple(
    'Category', 'social mao govm demcy nation media capi glob live home tran sex env eco')

filename = '/home/mouse/我的坚果云/董姐的论文所需/标题整理数据2.csv'


def train_text(filename, train_src=None):
    # Build a flat list of (label, text) pairs, one pair per category column
    # of every CSV row, in the format expected by Grocery.train().
    if train_src is None:
        train_src = []

    def format_cate():
        # Map each CSV row onto the Category namedtuple and pair every field
        # with its category label.
        for emp in map(Category._make, csv.reader(open(filename, 'r'))):
            social = (cate[0], emp.social)
            mao = (cate[1], emp.mao)
            govm = (cate[2], emp.govm)
            demcy = (cate[3], emp.demcy)
            nation = (cate[4], emp.nation)
            media = (cate[5], emp.media)
            capi = (cate[6], emp.capi)
            glob = (cate[7], emp.glob)
            live = (cate[8], emp.live)
            home = (cate[9], emp.home)
            tran = (cate[10], emp.tran)
            sex = (cate[11], emp.sex)
            env = (cate[12], emp.env)
            eco = (cate[13], emp.eco)
            yield social, mao, govm, demcy, nation, media, capi, glob, \
                live, home, tran, sex, env, eco

    for cat in format_cate():
        train_src.extend(list(cat))
    return train_src
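text_category.py imports train_src from a module named train_txt that is not listed above. Presumably it just wires the output of train_text() into a module-level variable; the following is only a guess at its contents, assuming it reuses format_data.py and the CSV path hard-coded there.

# train_txt.py (sketch of the module imported by text_category.py)
from format_data import train_text, filename

# train_src becomes the flat list of (label, text) pairs passed to Grocery.train()
train_src = train_text(filename)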
All of the programs above were written by me and ran successfully on my own machine, but they have not been tested on other computers or platforms. Because of dependency and compatibility issues, and the limits of my own skill, I cannot guarantee that they will run correctly for everyone.