python 生成词云

作者: LinJF | 来源:发表于2019-09-26 10:54 被阅读0次

用Python生成词云
python 词云生成
python 生成词云
python生成词云
python生成词云
Python3 生成中文词云
利用Python生成词云
利用python生成词云
Python jieba分词、词云、文件读取、函数调用、匿名函数
python 词云模块：wordcloud

各参数解释（部分）

font_path : string  #字体路径，需要展现什么字体就把该字体路径+后缀名写上，如：font_path = '黑体.ttf'

width : int (default=400) #输出的画布宽度，默认为400像素

height : int (default=200) #输出的画布高度，默认为200像素

prefer_horizontal : float (default=0.90) #词语水平方向排版出现的频率，默认 0.9 （所以词语垂直方向排版出现频率为 0.1 ）

mask : nd-array or None (default=None) #如果参数非空，则按该二值遮罩决定在哪里绘制词语；如果为空，则使用 width×height 的矩形画布。mask 非空时，设置的宽高值将被忽略，遮罩形状被 mask 取代。全白（#FFFFFF）的部分将不会绘制，其余部分会用于绘制词云。如：bg_pic = imread('读取一张图片.png')，背景图片的画布一定要设置为白色（#FFFFFF），然后显示的形状为不是白色的其他颜色。可以用ps工具将自己要显示的形状复制到一个纯白色的画布上再保存，就ok了。

scale : float (default=1) #按照比例进行放大画布，如设置为1.5，则长和宽都是原来画布的1.5倍

min_font_size : int (default=4) #显示的最小的字体大小

font_step : int (default=1) #字体步长，如果步长大于1，会加快运算但是可能导致结果出现较大的误差

max_words : number (default=200) #要显示的词的最大个数

stopwords : set of strings or None #设置需要屏蔽的词，如果为空，则使用内置的STOPWORDS

background_color : color value (default=”black”) #背景颜色，如background_color='white',背景颜色为白色

max_font_size : int or None (default=None) #显示的最大的字体大小

mode : string (default="RGB") #当参数为"RGBA"并且background_color为空（None）时，背景为透明

relative_scaling : float (default=.5) #词频和字体大小的关联性

color_func : callable, default=None #生成新颜色的函数，如果为空，则使用 self.color_func

regexp : string or None (optional) #使用正则表达式分隔输入的文本

collocations : bool, default=True #是否包括两个词的搭配

colormap : string or matplotlib colormap, default=”viridis” #给每个单词随机分配颜色，若指定color_func，则忽略该方法

random_state : int or None  #随机数种子；传入 int 时用作 random.Random 的种子，使每次生成的词云布局和颜色可以复现


fit_words(frequencies)  #根据词频生成词云
generate(text)  #根据文本生成词云
generate_from_frequencies(frequencies[, ...])   #根据词频生成词云
generate_from_text(text)    #根据文本生成词云
process_text(text)  #将长文本分词并去除屏蔽词（此处指英语，中文分词还是需要自己用别的库先行实现，使用上面的 fit_words(frequencies) ）
recolor([random_state, color_func, colormap])   #对现有输出重新着色。重新上色会比重新生成整个词云快很多
to_array()  #转化为 numpy array
to_file(filename)   #输出到文件

具体实现

1.jpg

import numpy as np
import matplotlib.pyplot as plt
#pip install WordCloud
from wordcloud import WordCloud,STOPWORDS
from PIL import Image
from os import path
import cv2
import matplotlib.pyplot as plt
# Use the SimHei font so that Chinese text renders correctly in matplotlib
plt.rcParams["font.sans-serif"]=["SimHei"]
# Make the minus sign render correctly on the axes
plt.rcParams["axes.unicode_minus"]=False
import os
#pip install jieba
import random,jieba

def single_wordColud_1():
    """Draw a circular word cloud from explicit per-word weights.

    The four words and their weights are hard-coded. The cloud is built
    with ``WordCloud.generate_from_frequencies`` and shown in a matplotlib
    window; nothing is returned.
    """
    weights = {"第一":0.1,"第二":0.2,"第三":0.3,"第四":0.4}

    # 300x300 grid: cells outside the circle centred at (150, 150) with
    # radius 130 become 255, cells inside become 0.
    rows, cols = np.ogrid[:300, :300]
    outside_circle = (rows - 150) ** 2 + (cols - 150) ** 2 > 130 ** 2
    circle_mask = 255 * outside_circle.astype(int)

    cloud = WordCloud(
        background_color="white",
        font_path='./simkai.ttf',
        repeat=True,
        mask=circle_mask,
    )
    cloud.generate_from_frequencies(weights)

    # Hide both axes before drawing the image
    plt.axis("off")
    plt.imshow(cloud, interpolation="bilinear")
    plt.show()
    
def single_wordColud():
    """Draw a circular word cloud from a short space-separated text.

    The text is hard-coded. The cloud is built with ``WordCloud.generate``
    and shown in a matplotlib window; nothing is returned.
    """
    sample_text = "第一 第二 第三 第四"

    # 300x300 grid: cells outside the circle centred at (150, 150) with
    # radius 130 become 255, cells inside become 0.
    rows, cols = np.ogrid[:300, :300]
    outside_circle = (rows - 150) ** 2 + (cols - 150) ** 2 > 130 ** 2
    circle_mask = 255 * outside_circle.astype(int)

    cloud = WordCloud(
        background_color="white",
        font_path='./simkai.ttf',
        repeat=True,
        mask=circle_mask,
    )
    cloud.generate(sample_text)

    # Hide both axes before drawing the image
    plt.axis("off")
    plt.imshow(cloud, interpolation="bilinear")
    plt.show()

def grey_color_func(word,font_size,position,orientation,random_state=None,**kwargs):
    """Return a random grey tone as an HSL colour string.

    All arguments are accepted but ignored. The lightness is drawn from
    the module-level ``random`` generator, uniformly from 60 to 100
    inclusive.
    """
    lightness = random.randint(60, 100)
    return "hsl(0,0%%,%d%%)" % lightness


def get_stopwords():
    """Load the stop-word list from ``txt/stopwords.txt``.

    The file is looked up relative to the current working directory and
    read as UTF-8, one entry per line. Newlines, tabs and full-width
    spaces (U+3000) are removed from each line before it is stored.

    Returns:
        set: the cleaned lines of the file. A blank line contributes an
        empty-string entry.

    Raises:
        OSError: if the file cannot be opened (for example
            FileNotFoundError when it does not exist).
    """
    # NOTE: the previous `"__file__" in locals()` check could never succeed
    # inside a function (locals() holds no "__file__" here), so the path has
    # always been resolved against the working directory. Keep doing that,
    # explicitly.
    stopwords_path = os.path.join(os.getcwd(), "txt/stopwords.txt")

    # The context manager closes the file even if reading fails.
    with open(stopwords_path, "r", encoding="utf-8") as f:
        return {
            line.replace("\n", "").replace("\t", "").replace("\u3000", "")
            for line in f
        }

def segment_words(text):
    """Segment text with jieba and return the tokens separated by spaces.

    ``jieba.cut`` is called with ``cut_all=False``. Every token is followed
    by one space, so a non-empty result ends with a trailing space.
    """
    tokens = jieba.cut(text, cut_all=False)
    return "".join(token + " " for token in tokens)



def drow_mask_wordColud():
    """Render a word cloud shaped by a mask image, save it and display it.

    Steps:
      1. read the mask image ``img/1.jpg`` (it must have a white background);
      2. read ``txt/test.txt`` as UTF-8 and remove newlines, tabs and
         full-width spaces (U+3000);
      3. print the text, segment it with ``segment_words`` and load the
         stop words with ``get_stopwords``;
      4. build the word cloud, write it to ``img/test.png`` and show it in
         a matplotlib window.

    All paths are relative to the current working directory.

    Raises:
        FileNotFoundError: if the mask image cannot be read.
        OSError: if the text file cannot be opened.
    """
    mask_path = "img/1.jpg"
    mask = cv2.imread(mask_path)
    # cv2.imread reports an unreadable file by returning None instead of
    # raising; without this check the cloud would silently lose its shape.
    if mask is None:
        raise FileNotFoundError("cannot read mask image: %s" % mask_path)

    # The content of test.txt is arbitrary; the context manager makes sure
    # the file handle is closed.
    with open("txt/test.txt", "r", encoding="utf-8") as text_file:
        text = text_file.read().replace("\n","").replace("\t","").replace("\u3000","")

    print(text)
    # Segment the text into space-separated words
    text = segment_words(text)
    # Load the stop words
    stopwords = get_stopwords()

    # WordCloud settings:
    #   scale            - output sharpness multiplier; larger is sharper (default 1)
    #   max_words        - number of words to show
    #   mask             - the shape image loaded above
    #   stopwords        - set of words to exclude
    #   margin           - spacing between words
    #   background_color - background colour of the generated image
    #   font_path        - a font with Chinese glyphs, otherwise characters
    #                      are drawn as empty boxes
    #   random_state     - fixed value; presumably for reproducible output
    wc = WordCloud(
        scale=4,
        max_words=300,
        mask=mask,
        background_color="white",
        font_path='./simkai.ttf',
        stopwords=stopwords,
        margin=10,
        random_state=1,
    ).generate(text)
    default_colors = wc.to_array()
    # Save the word cloud image
    wc.to_file("img/test.png")
    plt.imshow(default_colors,interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Script entry point: by default render the mask-shaped word cloud.
if __name__ == "__main__":
    drow_mask_wordColud()
    # Alternative demos (circular clouds); uncomment to run them instead:
    #single_wordColud()
    #single_wordColud_1()

结果

test.png

自定义一个字体颜色

from wordcloud import WordCloud,get_single_color_func
import matplotlib.pyplot as plt

class GroupedColorFunc:
    """Callable colour function that colours words according to their group.

    ``color_to_words`` maps a colour to an iterable of words. A word found
    in one of the groups gets that group's colour function (the first
    matching group wins); any other word gets the colour function built
    from ``default_color``.
    """

    def __init__(self,color_to_words,default_color):
        # One (colour function, word set) pair per configured colour
        pairs = []
        for color, words in color_to_words.items():
            pairs.append((get_single_color_func(color), set(words)))
        self.color_func_to_words = pairs
        # Attribute name keeps its original spelling
        self.defalt_color_func = get_single_color_func(default_color)

    def get_color_func(self,word):
        """Return the colour function of the first group containing ``word``,
        or the default colour function when no group contains it."""
        for candidate, members in self.color_func_to_words:
            if word in members:
                return candidate
        return self.defalt_color_func

    def __call__(self,word,**kwargs):
        """Colour ``word`` by delegating to the colour function chosen for it."""
        chosen = self.get_color_func(word)
        return chosen(word, **kwargs)


# Demo: recolour a word cloud with per-group colours.
if __name__ == "__main__":
    text = "第一 第二 第三 第四 第五 第六"
    # Build the word cloud
    wc = WordCloud(collocations=False,font_path='./simkai.ttf',background_color="white").generate(text)
    # Colour of each word group
    color_to_words={
        # A colour may be given as an RGB hex value
        "#00ff00":["第一","第五"],
        "red":["第三","第六"],
        "yellow":["第二"]
    }
    # Default colour for words that are in no group
    default_color = "blue"
    grouped_color_func = GroupedColorFunc(color_to_words,default_color)
    # Apply the colours to the word cloud
    wc.recolor(color_func=grouped_color_func)
    # Show the word cloud
    plt.figure()
    plt.imshow(wc,interpolation="bilinear")
    plt.axis("off")
    plt.show()

注：运行过程中，缺什么库就安装什么库（pip install）

用Python生成词云
以下为简单的Python生成词云代码。基于Python3.7，macOS 10.14.2
python 词云生成
背景最近在研究一些深度学习序列模型,比如RNN和LSTM,这种主要来处理时序数据的神经网络。传统的语言模型主要是...
python 生成词云
各参数解释（部分）具体实现结果自定义一个字体颜色注：运行过程，缺什么库导什么库
python生成词云
“词云”这个概念由美国西北大学新闻学副教授、新媒体专业主任里奇·戈登（Rich Gordon）提出。 “词云”就是...
python生成词云
生成词云还挺简单的。就用wordcloud库就可以生成。WordCloud用的内容可以是字符串，也可以是dict....
Python3 生成中文词云
前提 Python 生成中文词云主要用到两个依赖库： jieba：中文分词工具 wordcloud：词云生成工具 ...
利用Python生成词云
用到的模块 matplotlib 用来画图 wordcloud 生成词云 jieba 中文分词 numpy ...
利用python生成词云
利用python的wordcloud包生成词云，分析前程无忧数据分析岗位的岗位职责和岗位要求效果
Python jieba分词、词云、文件读取、函数调用、匿名函数
词云的生成使用wordcloud 库生成词云安装wordcloud 调用wordcloud类，生成词云对象词...
python 词云模块：wordcloud
参考：生成词云之python中WordCloud包的用法https://amueller.github.io/wo...