Python: High-Frequency Word Analysis of Tens of Thousands of Weibo Posts
Seeing that others had done statistical analysis of trending movie and TV reviews, I thought it looked fun and decided to give it a try.
Here is what the result looks like:
[Result screenshot: Screenshot_2018-05-21-11-00-42-879_com.master.wei.png]
Approach
- Scrape the Weibo posts you want and write them to the database
- Tokenize the text and count how often each word appears
- Filter out meaningless noise words
- Store the results in the database
- Write an API, then display the results on the Android client
Code
- Scraping the data is a discipline of its own; please learn it on your own.
Ready-made SQL: https://note.youdao.com/share/?id=f98dfc8417a2ae7d1990343e387e87b6&type=note#/
Just import it into MySQL.
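The exact table layout comes with the linked SQL dump, so it isn't reproduced here. For orientation only: the analysis script further down reads a table `weibo.masterWeiBo_master` with at least a `come` column (which category/source a post belongs to) and a `content` column (the post text). A rough, guessed sketch of such a table created from Python (the real schema may well differ):

import pymysql

# Hypothetical layout of weibo.masterWeiBo_master -- only `come` and `content`
# are actually read by the analysis script below.
client = pymysql.connect(host='localhost', charset='utf8', port=3306,
                         user='root', passwd='ck123', db='weibo')
cursor = client.cursor()
cursor.execute("""CREATE TABLE IF NOT EXISTS `weibo`.`masterWeiBo_master` (
    `id` INT NOT NULL AUTO_INCREMENT,
    `come` VARCHAR(100) NOT NULL,
    `content` TEXT NOT NULL,
    PRIMARY KEY (`id`));""")
cursor.close()
client.close()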
- Database connection: masterWeiBo.Utils.Sql
import pymysql
import pymysql.cursors
import threading


class Mydb(object):
    tableName = 'master'

    def __init__(self):
        self.lock = threading.Lock()
        self.client = pymysql.connect(host='localhost', charset='utf8', port=3306,
                                      user='root', passwd='ck123', db='weibo',
                                      cursorclass=pymysql.cursors.DictCursor)
        self.client.autocommit(True)
        self.cursor = self.client.cursor()
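A quick sanity check that the connection class works (this snippet is not in the original post; it only uses the module path given above):

from masterWeiBo.Utils.Sql import Mydb as db

cursor = db().cursor
cursor.execute("SELECT VERSION()")  # any trivial query will do
print(cursor.fetchone())
cursor.close()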
- The main script
import jieba
from masterWeiBo.Utils.Sql import Mydb as db


# Build the stop-word list (one word per line in the file)
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords


cursor = db().cursor

# Create the word table if it does not exist yet
cursor.execute("""CREATE TABLE IF NOT EXISTS `weibo`.`masterWeiBo_category` (
    `id` INT NOT NULL AUTO_INCREMENT,
    `count` INT NOT NULL DEFAULT 0,
    `category` VARCHAR(100) NOT NULL,
    `wordsTop10` VARCHAR(1000) NULL,
    PRIMARY KEY (`id`));""")

# Empty the word table before recomputing
cursor.execute("DELETE FROM weibo.masterWeiBo_category")

# Get every category (`come`) and how many posts it contains
cursor.execute("SELECT count(id) as countd, come FROM weibo.masterWeiBo_master GROUP BY come")
results = cursor.fetchall()
print(results)

dicts = []
# Load the words to filter out
stopwords = stopwordslist("/root/PYServer/myFirstPYServer/words.txt")

for result in results:
    each = {}
    each['count'] = result['countd']
    each['come'] = result['come']
    print(result['countd'])
    print(result['come'])
    # Fetch every post of this category (parameterized to avoid quoting problems)
    cursor.execute("SELECT content FROM weibo.masterWeiBo_master WHERE come = %s", (result['come'],))
    contents = cursor.fetchall()
    articals = ''
    # Concatenate all posts of this category into one string
    for artical in contents:
        articals += "," + artical['content']
    # Tokenize with jieba
    cuts = jieba.cut(articals)
    words = {}
    # Count how often each word appears
    for cut in cuts:
        if cut in words:
            words[cut] = words[cut] + 1
        else:
            words[cut] = 1
    # Sort by frequency, descending
    sortedWords = sorted(words.items(), key=lambda d: d[1], reverse=True)
    wordsTop10 = ''
    i = 0
    # Collect the top 10 words
    for key, value in sortedWords:
        # Skip noise: stop words and single characters
        if key in stopwords or len(key) < 2:
            continue
        wordsTop10 += key + "," + str(value) + ";"
        i += 1
        if i == 10:
            break
    # Drop the trailing ';' (also when fewer than 10 valid words were found)
    wordsTop10 = wordsTop10.rstrip(';')
    each['wordsTop10'] = wordsTop10
    dicts.append(each)

# Write the per-category results to the database
for value in dicts:
    sql = "INSERT INTO weibo.masterWeiBo_category (count, category, wordsTop10) VALUES (%s, %s, %s)"
    print(sql, value)
    cursor.execute(sql, (value['count'], value['come'], value['wordsTop10']))

cursor.close()
print(dicts)
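A side note on the counting step: the manual dictionary bookkeeping in the loop above can be replaced by the standard library's collections.Counter, which also hands you the top-N directly. This is just an alternative sketch, not what the original script does:

from collections import Counter

import jieba


def top_words(text, stopwords, n=10):
    # Count every token, then keep the n most frequent ones that are
    # not stop words and are at least two characters long.
    counts = Counter(jieba.cut(text))
    ranked = [(word, freq) for word, freq in counts.most_common()
              if word not in stopwords and len(word) >= 2]
    return ranked[:n]

Called as top_words(articals, stopwords), it returns a list of (word, count) pairs equivalent to what wordsTop10 encodes as a string.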
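The "write an API" step from the approach isn't shown in the post. Purely as an illustration of how the masterWeiBo_category table could be served to the Android client, here is a minimal sketch using Flask; the /category endpoint name is made up, and judging by the table names the original backend is more likely a Django project, so treat this only as a stand-in:

from flask import Flask, jsonify

from masterWeiBo.Utils.Sql import Mydb as db

app = Flask(__name__)


@app.route('/category')
def category():
    # Return everything the analysis script wrote, as JSON for the client
    cursor = db().cursor
    cursor.execute("SELECT id, `count`, category, wordsTop10 FROM weibo.masterWeiBo_category")
    rows = cursor.fetchall()
    cursor.close()
    return jsonify(rows)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)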