import os
import re
import requests
from urllib.parse import urljoin
from jieba import analyse
from lxml import etree
from numpy import mat, zeros

def is_given_file_dir(func):
    """Decorator: ensure the data and result directories exist before func runs."""
    def fun(self):
        if not os.path.exists(self.download_file_to_save):
            os.makedirs(self.download_file_to_save)
            print('Created TXT source data directory --> {} successfully!'.format(self.download_file_to_save))
        if not os.path.exists(self.result_file_to_save):
            os.makedirs(self.result_file_to_save)
            print('Created results directory --> {} successfully!'.format(self.result_file_to_save))
        func(self)
    return fun
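
# Note: on Python 3.2+ the two existence checks above could be collapsed into
# os.makedirs(path, exist_ok=True); the decorator form is kept here so that
# any file-writing method can opt in with a single @is_given_file_dir line.
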
class SpeechAnalysis:
    def __init__(self):
        self.start_url = 'http://jhsjk.people.cn/'
        self.domestic = 'result/1?area=401'
        self.BASE_URL = os.path.dirname(__file__)
        self.download_file_to_save = os.path.join(self.BASE_URL, 'download_data')
        self.result_file_to_save = os.path.join(self.BASE_URL, 'results')
        self.REGEX = '[: ,。!《》、—“”;?()]'
        self.total = 0
    @staticmethod
    def parse(url):  # fetch the page and parse it into an lxml HTML document tree
        html = requests.get(url).content.decode()
        return etree.HTML(html)
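    # Usage sketch (hypothetical, requires network access): parse() returns an
    # lxml element tree root, so callers can run XPath queries on it directly:
    #     tree = SpeechAnalysis.parse('http://jhsjk.people.cn/')
    #     links = tree.xpath('//a/@href')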
    def get_page(self):
        next_page = self.start_url + self.domestic
        page = 1
        while next_page:
            print('Collecting data from page {}...'.format(page))
            tree = self.parse(next_page)
            # Use XPath to extract the links and titles of the speeches
            link = tree.xpath('//ul[@class="list_14 p1_2 clearfix"]//li/a/@href')
            title = tree.xpath('//ul[@class="list_14 p1_2 clearfix"]//li/a/text()')
            yield link, title, page
            # Update the next-page link; it comes back empty after the last
            # page, at which point every page has been crawled
            next_page = tree.xpath('//a[@rel="next"]/@href')
            if next_page:
                next_page = urljoin(self.start_url, next_page[0])  # resolve relative hrefs
            if page == 1:  # this block caps the page count; comment it out to crawl all pages, or change the number to crawl more
                break
            page += 1
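    # Usage sketch: get_page() is a generator, so callers can stream one page
    # of (links, titles, page) at a time instead of fetching the whole site:
    #     for links, titles, page in SpeechAnalysis().get_page():
    #         print(page, len(links))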
    @is_given_file_dir
    def save_text(self):
        for link_list, title_list, page in self.get_page():
            i = 0
            for link in link_list:
                self.total += 1
                print('Saving article {}...'.format(self.total))
                tree = self.parse(self.start_url + link)
                # Use XPath to extract the body text of the speech
                txt = tree.xpath('string(//div[@class="d2txt_con clearfix"])').strip()
                # Strip the special characters that are not allowed in file names
                file_name = re.compile(r'[\\/:*?"<>|]').sub('', title_list[i])
                file = os.path.join(self.download_file_to_save, file_name.strip() + '.txt')
                # Write the speech text to a file named after the speech title
                with open(file, 'w', encoding='utf-8') as f:
                    f.write(txt)
                i += 1
                if i == 3:  # this block caps the articles taken per page; comment it out to take every article, or change the number
                    break
            print('Page {} data collected!'.format(page))
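    # For reference, the sanitizing regex above removes exactly the characters
    # Windows forbids in file names, e.g.
    #     re.compile(r'[\\/:*?"<>|]').sub('', 'a:b?c|d') -> 'abcd'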
    def ergodic_txt(self):
        for root, dirs, files in os.walk(self.download_file_to_save):
            for file in files:  # walk the directory tree and yield each .txt file
                if file.lower().endswith('.txt'):
                    yield os.path.join(root, file)
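    # An equivalent sketch using the standard library's pathlib (extension
    # case handling aside):
    #     from pathlib import Path
    #     yield from (str(p) for p in Path(self.download_file_to_save).rglob('*.txt'))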
    def write_txt(self):
        pass
    def tf_idf(self):
        fre = set()
        for txt in self.ergodic_txt():
            with open(txt, 'r', encoding='utf-8') as f:
                data = f.read()
                # Use the TF-IDF algorithm to extract each article's top 20 keywords
                top_20 = analyse.extract_tags(data)
                fre.update(set(top_20))  # merge all keywords into a single set
        return fre
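    # For reference, jieba.analyse.extract_tags ranks words by
    # tfidf(t, d) = tf(t, d) * idf(t), with idf values taken from jieba's
    # bundled corpus; its default topK is 20, hence no explicit count above.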
    def freq_vector(self):
        """Return the term-frequency matrix: rows -> number of documents
        processed, columns -> length of the frequency vector."""
        all_words_set = self.tf_idf()
        print(all_words_set)
        # The row count comes from the files directly under download_data, so
        # the directory is assumed to hold only the saved .txt files
        root, dirs, files = next(os.walk(self.download_file_to_save))
        m, n = len(files), len(all_words_set)
        freq_mat = mat(zeros((m, n)))
        i = 0
        for txt in self.ergodic_txt():
            with open(txt, 'r', encoding='utf-8') as f:
                data = f.read()
                freq_mat[i, :] = mat([data.count(x) for x in all_words_set])
                i += 1
        print(freq_mat)
        return freq_mat
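
# A minimal sketch of one way to use the frequency matrix downstream (this
# helper is an illustration, not part of the original script): cosine
# similarity between the document row vectors.
def cosine_similarity_matrix(freq_mat):
    """Return the m x m cosine-similarity matrix of the rows of freq_mat."""
    import numpy as np
    a = np.asarray(freq_mat, dtype=float)
    norms = np.linalg.norm(a, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # guard against all-zero (empty) documents
    unit = a / norms
    return unit @ unit.T
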
if __name__ == '__main__':
    my_speech = SpeechAnalysis()
    # my_speech.save_text()
    my_speech.freq_vector()