美文网首页
K均值聚类-可视化结果

K均值聚类-可视化结果

作者: 你的仙女本仙 | 来源:发表于2020-04-27 13:08 被阅读0次

    数据清洗
    将csv、excel等行式数据转换成二维excel数据
    import pandas as pd

    # Load the raw CSV. header=0 treats the first row as a header that is
    # replaced by the explicit column names A-F.
    data = pd.read_csv("lipstick.csv",header=0,names=["A","B","C","D","E","F"])
    print(data["F"])

    # Write column F to a text file, one record per output line. Newlines
    # embedded in a value are replaced with a double quote so that each record
    # stays on a single line. The with-block guarantees the file is flushed and
    # closed even if a write fails.
    with open('lipstick1.txt', 'w', encoding='utf-8') as result:
        for i in data["F"]:
            result.write(str(i).replace('\n', '"'))
            result.write('\n')

    数据处理

    import pandas as pd
    # Load the working dataset; the sections below read its shop, number_pay,
    # amount, location1 and price columns.
    df=pd.read_excel("lip.xlsx")
    import matplotlib.pyplot as plt
    import seaborn as sns
    from pyecharts.charts import Bar, Pie,Map
    from pyecharts import options as opts
    import numpy as np
    
    # Matplotlib font setup: select the SimHei font (presumably so the Chinese
    # axis labels render) and turn off the Unicode minus glyph for tick labels.
    plt.rcParams['font.sans-serif']=['SimHei']
    plt.rcParams['axes.unicode_minus'] = False


    # Bar chart of the 30 shops with the highest summed number_pay.
    # The statements are aligned with the rest of the script; a stray extra
    # leading space would be an "unexpected indent" error.
    s=df.groupby('shop').number_pay.sum().sort_values(ascending=False)[:30]\
        .plot(kind='bar',color=['r','g','b','g','b','tan','c'])
    plt.xlabel('店铺')
    plt.ylabel('购买人数')
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig('购买人数最多的前30店铺.png',bbox_inches='tight')
    plt.show()
    
    # Total sales amount per shop, keeping the 20 largest.
    y1=df.groupby('shop').amount.sum().sort_values(ascending=False)[:20]
    y_amount=pd.DataFrame(y1)
    print(y1.index,y1.values)

    # Slice sizes are the square root of each shop's amount, rounded to whole
    # numbers, paired with the shop name as [name, value].
    rose_values = np.round(np.sqrt(y1.values),0)
    rose_pairs = [[shop_name, value] for shop_name, value in zip(y1.index, rose_values)]

    # One colour per slice (20 entries for the 20 shops).
    color_series = [
        '#FAE927','#E9E416','#C9DA36','#9ECB3C','#6DBC49',
        '#37B44E','#3DBA78','#14ADCF','#209AC9','#1E91CA',
        '#2C6BA0','#2B55A1','#2D3D8E','#44388E','#6A368B',
        '#7D3990','#A63F98','#C31C88','#D52178','#D5225B',
    ]

    canvas_opts = opts.InitOpts(width='1350px', height='750px')
    pie1 = Pie(init_opts=canvas_opts)
    pie1.set_colors(color_series)
    pie1.add(
        "",
        rose_pairs,
        radius=["20%", "100%"],
        center=["30%", "65%"],
        rosetype="area",
    )

    # Global options: chart title, hidden legend, default toolbox.
    pie1.set_global_opts(
        title_opts=opts.TitleOpts(title='玫瑰图示例'),
        legend_opts=opts.LegendOpts(is_show=False),
        toolbox_opts=opts.ToolboxOpts(),
    )

    # Series options: label each slice inside the wedge as "name:value".
    slice_labels = opts.LabelOpts(
        is_show=True,
        position="inside",
        font_size=12,
        formatter="{b}:{c}",
        font_style="italic",
        font_weight="bold",
        font_family="Microsoft YaHei",
    )
    pie1.set_series_opts(label_opts=slice_labels)
    pie1.render('销售额前20店铺.html')
    
    image.png

    ################################店铺所在地区分布

    # Number of rows per location1 value, largest first.
    y2=df.groupby('location1').shop.count().sort_values(ascending=False)
    y_amount=pd.DataFrame(y2)

    # (province, shop count) pairs fed to the map; these are hard-coded here
    # rather than taken from y2.
    data=[
        ('广东',991), ('上海',925), ('浙江',633), ('北京',362), ('江苏',353),
        ('山东',217), ('辽宁',103), ('香港',103), ('四川',95), ('福建',85),
        ('安徽',65), ('湖北',63), ('湖南',59), ('河北',57), ('黑龙江',54),
        ('天津',45), ('河南',36), ('江西',19), ('重庆',16), ('吉林',15),
        ('陕西',14), ('山西',12), ('广西',4), ('海南',4), ('云南',3),
        ('贵州',1), ('台湾',1), ('内蒙古',1), ('甘肃',1),
    ]

    # Piecewise colour buckets for the visual map, darkest for the largest counts.
    legend_pieces = [
        {"max": 999, "min": 500, "label": "500-999", "color": "#B40404"},
        {"max": 499, "min": 100, "label": "100-499", "color": "#DF0101"},
        {"max": 99, "min": 60, "label": "60-99", "color": "#F78181"},
        {"max": 59, "min": 10, "label": "10-59", "color": "#F5A9A9"},
        {"max": 9, "min": 0, "label": "1-9", "color": "#FFFFCC"},
    ]
    visual_map = opts.VisualMapOpts(max_=2000, is_piecewise=True, pieces=legend_pieces)
    province_labels = opts.LabelOpts(is_show=True, color='#ffffff')

    china_map = (
        Map(init_opts=opts.InitOpts(theme='dark'))
        .add("", data, 'china', is_map_symbol_show=False, is_roam=False)
        .set_series_opts(label_opts=province_labels)
        .set_global_opts(
            title_opts=opts.TitleOpts(title="店铺所在地区分布地图"),
            legend_opts=opts.LegendOpts(is_show=False),
            visualmap_opts=visual_map,
        )
    )
    china_map.render("店铺所在地区分布.html")
    
    image.png

    ##############每个店铺的口红单价情况

    # Mean price per shop; y4 is the same series sorted from highest to lowest
    # and is only printed.
    shop_mean_price = df.groupby('shop').price.mean()
    y4=shop_mean_price.sort_values(ascending=False)
    print(y4)

    # Normalised histogram (green) with a dashed red KDE curve on top.
    hist_style = {'color':'g'}
    kde_style = {'linestyle':'--', 'color':'red'}
    sns.distplot(
        shop_mean_price,
        color="g",
        norm_hist=True,
        hist_kws=hist_style,
        kde_kws=kde_style,
    )
    plt.xlim(0,1000)  # restrict the x axis to prices from 0 to 1000
    plt.title('口红单价')
    plt.show()
    
    import jieba
    from wordcloud import WordCloud
    from PIL import Image
    
    # Read the whole corpus as UTF-8 text.
    filename = "lip_word.txt"
    with open(filename,encoding='UTF-8') as f:
        mytext = f.read()

    # Segment the text with jieba and join the tokens with spaces, the
    # whitespace-separated form WordCloud.generate() is given below.
    mytext = " ".join(jieba.cut(mytext))
    # str.replace returns a new string, so the result must be rebound;
    # calling it without assignment leaves "/" and "|" in the text.
    mytext = mytext.replace("/", '').replace("|", '')

    # The image is used as the word-cloud mask and is also shown on its own below.
    alice_mask = np.array(Image.open("lips.png"))
    wordcloud = WordCloud(font_path="simsun.ttf",background_color="white",
                          collocations=False,  # do not count two-word collocations
                          width=800,
                          height=600,
                          mask=alice_mask,max_words=500).generate(mytext)

    # Figure 1: the word cloud; figure 2: the raw mask image.
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.figure()
    plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    
    
    image.png
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.cluster import KMeans
    # Load the modelling table; its 'index' column becomes the row index.
    df=pd.read_excel("lip_sum_dataming.xlsx",index_col = 'index')

    # Select the feature columns, then drop every row whose number_pay is
    # missing. These must be two separate statements: df1nona is read on the
    # next line and would otherwise be undefined.
    df1=df[['price','number_pay','adress','if_company','if_off_shop']]
    df1nona=df1[df1['number_pay'].notna()]
    print(np.isnan(df1nona).any())  # per column: does any NaN remain?

    # NOTE(review): scaling and clustering below run on the full df, not on
    # df1nona — confirm which frame is intended.
    scale=MinMaxScaler().fit(df)  # min-max scaler fitted on df
    df_scale=scale.transform(df)  # min-max scaled values (not used further below)
    data_zs = 1.0*(df - df.mean())/df.std()  # z-score standardisation; this is what gets clustered

    # Two-cluster K-means on the standardised data.
    kmeans=KMeans(n_clusters=2).fit(data_zs)
    inertia = kmeans.inertia_  # within-cluster sum of squared distances
    ssa=kmeans.inertia_  # same value under a second name
    y_kmeans2=kmeans.predict(data_zs)

    # Reduce the standardised data with t-SNE (default settings) for plotting.
    from sklearn.manifold import TSNE
    tsne = TSNE()
    tsne.fit_transform(data_zs)  # return value unused; the embedding is read from the estimator
    tsne = pd.DataFrame(tsne.embedding_, index = data_zs.index)  # rebinds tsne to the embedding frame

    r1 = pd.Series(kmeans.labels_).value_counts()  # number of samples per cluster
    r2 = pd.DataFrame(kmeans.cluster_centers_)  # cluster centres
    r = pd.concat([r2, r1], axis = 1)  # centres joined with counts; overwritten just below

    # Per-sample output: standardised features plus the assigned cluster label.
    r = pd.concat([data_zs, pd.Series(kmeans.labels_, index = data_zs.index)], axis = 1)
    r.columns = list(df.columns) + [u'聚类类别']  # rename the header; the label column goes last
    # NOTE(review): writing the legacy .xls format may be unsupported by newer
    # pandas releases — confirm against the installed version.
    r.to_excel('data_type_2.xls')

    # Scatter the t-SNE embedding, one marker style per cluster.
    d = tsne[r[u'聚类类别'] == 0]  # embedding rows assigned to cluster 0
    plt.plot(d[0], d[1], 'r.')
    d = tsne[r[u'聚类类别'] == 1]  # embedding rows assigned to cluster 1
    plt.plot(d[0], d[1], 'go')
    plt.show()
    
    image.png

    相关文章

      网友评论

          本文标题:K均值聚类-可视化结果

          本文链接:https://www.haomeiwen.com/subject/eteywhtx.html