一、报告背景:
黑色星期五是美国感恩节后一天,圣诞节前的一次大采购活动,当天一般美国商场会推出大量的打折优惠、促销活动, 由于美国的商场一般以红笔记录赤字,以黑笔记录盈利,而感恩节后的这个星期五人们疯狂的抢购使得商场利润大增,因此被商家们称作黑色星期五。 商家期望通过以这一天开始的圣诞大采购为这一年获得最多的盈利。
二、报告目的:
1、本次的分析数据来自于Kaggle提供的某电商黑色星期五的销售记录, 将围绕产品和用户两大方面展开叙述,为电商平台制定策略提供分析及建议。
2、通过案例数据完成一份Python的数据分析报告。
三、数据来源:
数据:链接:https://pan.baidu.com/s/1wbglbeYG03fG49uRJAd2Cw 提取码:5b1d
编程:jupyter notebook
四、思维导图
序号:代表优先顺序
符号:√代表分析,X代表不分析

五、主要框架
1.整体消费的情况
2.用户画像分析(探究最优价值的用户类型:性别、年龄、职业、婚姻)
3.城市业绩分析(城市分布 、居住年限分布)
3.产品分析(探究最优价值的产品) 细化分析:产品销售额Top 3产品、产品销售额Top3 产品类别
4.结论以及建议(详见思维导图)
六、代码解析
#导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#设定绘图风格
plt.style.use('ggplot')
df=pd.read_csv(r'C:\Users\41174\Desktop\BlackFriday.csv')
print(df.head())

df.info()

原始数据中共有12个字段,每个字段共537577行,字段解释如下:
User_ID: 用户ID
Product_ID: 产品ID
Gender: 性别
Age: 年龄
Occupation: 职业
City_Category: 城市(A,B,C)
Stay_In_Current_City_Years: 居住时长
Marital_Status: 婚姻状况
Product_Category_1 产品类别1,是一级分类
Product_Category_2 产品类别2,是二级分类
Product_Category_3 产品类别3,是三级分类
Purchase: 金额(美元)
df.isna().any()

missing_percentage = (df.isnull().sum()/df.shape[0]*100).sort_values(ascending=False)
missing_percentage = missing_percentage[missing_percentage!=0].round(2)
print(missing_percentage)

其中产品类别2 和3 是有欠缺的
Product_Category_2 将近欠缺了31% 的数据
Product_Category_3 将近欠缺了69%的数据
df.fillna(0,inplace=True) #缺失值使用 0 填充
df.isna().any().sum() #结果没有缺失值
print(df["Purchase"].sum() /100000000)#总的消费金额是50.17亿美元
print(df["Product_ID"].count()/10000) #所销售的产品数量53.76万
print(df["Purchase"].sum()/df["Product_ID"].count()) #平均每个产品的价格是9333美元
print(df["Purchase"].sum()/df["User_ID"].drop_duplicates(keep='first').count()) #平均客单价是85万美元
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
df_Gender=df.groupby("Gender").Purchase.sum().reset_index()
df_Gender=df_Gender.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(('F','M'),df_Gender.Purchase)
ax.set_xlabel("性别")
ax.set_ylabel('亿元')
ax.set_ylim((0,45))
ax.set_title("不同性别的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12,)
plt.show()

df_Gender=df.groupby("Gender").Purchase.sum().reset_index()
df_Gender=df_Gender.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
labels=[u"F",u"M"]
rects=plt.pie(df_Gender.Purchase,labels=labels,autopct='%1.0f%%')
ax.set_title('不同性别消费占比')
plt.show()

df_Age=df.groupby("Age").Purchase.sum().reset_index()
df_Age1=df_Age.drop(['Purchase'],axis=1).reset_index()
df_Age=df_Age.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
df_Age=pd.merge(left=df_Age1,right=df_Age,how="inner",on='index',suffixes=('','_del'))
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(df_Age.Age,df_Age.Purchase)
ax.set_xlabel("年龄")
ax.set_ylabel('亿元')
ax.set_ylim((0,25))
ax.set_title("不同年龄的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12,)
plt.show()

df_Age=df.groupby("Age").Purchase.sum().reset_index()
df_Age1=df_Age.drop(['Purchase'],axis=1).reset_index()
df_Age=df_Age.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
df_Age=pd.merge(left=df_Age1,right=df_Age,how="inner",on='index',suffixes=('','_del'))
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.pie(df_Age.Purchase,labels=df_Age.Age,autopct='%.0f%%')
ax.set_title("不同年龄的总消费额占比")

df_Occupation=df.groupby("Occupation").Purchase.sum().reset_index()
df_Occupation=df_Occupation.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'Occupation',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
df_Occupation.Occupation=df_Occupation.Occupation.values.astype('str')
rects=plt.bar(df_Occupation.Occupation,df_Occupation.Purchase)
ax.set_xlabel("职业")
ax.set_ylabel('千万')
ax.set_ylim((0,80))
ax.set_title("不同职业的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Occupation=df.groupby("Occupation").Purchase.sum().reset_index().sort_values(by="Purchase")
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.pie(df_Occupation.Purchase,labels=df_Occupation.Occupation,autopct='%.0f%%',startangle=90)
ax.set_title("不同职业的总消费额占比")

df_City_Category=df.groupby("City_Category").Purchase.sum().reset_index()
df_City_Category=df_City_Category.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'City_Category',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(("A","B","C"),df_City_Category.Purchase)
ax.set_xlabel("城市")
ax.set_ylabel('亿元')
ax.set_ylim((0,250))
ax.set_title("不同城市的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_City_Category=df.groupby("City_Category").Purchase.sum().reset_index().sort_values(by="Purchase")
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.pie(df_City_Category.Purchase,labels=df_City_Category.City_Category,autopct='%.0f%%')
ax.set_title("不同城市的总消费额占比")

df_Stay_In_Current_City_Years=df.groupby("Stay_In_Current_City_Years").Purchase.sum().reset_index()
df_Stay_In_Current_City_Years=df_Stay_In_Current_City_Years.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'Stay_In_Current_City_Years',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(df_Stay_In_Current_City_Years.Stay_In_Current_City_Years,df_Stay_In_Current_City_Years.Purchase)
ax.set_xlabel("年限")
ax.set_ylabel('亿元')
ax.set_ylim((0,200))
ax.set_title("不同年限的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Stay_In_Current_City_Years=df.groupby("Stay_In_Current_City_Years").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Stay_In_Current_City_Years.Stay_In_Current_City_Years=df_Stay_In_Current_City_Years.Stay_In_Current_City_Years.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Stay_In_Current_City_Years.Purchase,labels=df_Stay_In_Current_City_Years.Stay_In_Current_City_Years,autopct='%.0f%%',startangle=90)
ax.set_title("不同年限的总消费额占比")
plt.show()

df_User_ID_count=df.groupby('User_ID').Purchase.count().reset_index().rename({'Purchase':'Count'},axis=1).sort_values(by='Count',ascending=False).head(10)
df_User_ID_sum=df.groupby('User_ID').Purchase.sum().reset_index().sort_values(by='Purchase',ascending=False).head(10)
df_User_ID=pd.merge(left=df_User_ID_sum,right=df_User_ID_count,how='inner',on='User_ID',suffixes=('','_del'))
df_User_ID['Prep']=df_User_ID.apply(lambda x:int(x[1]/x[2]),axis=1)
print(df_User_ID)

df_Marital_Status=df.groupby("Marital_Status").Purchase.sum().reset_index()
df_Marital_Status=df_Marital_Status.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'Marital_Status',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
df_Marital_Status.Marital_Status=df_Marital_Status.Marital_Status.values.astype('str')
rects=plt.bar(df_Marital_Status.Marital_Status,df_Marital_Status.Purchase)
ax.set_xlabel("婚否")
ax.set_ylabel('亿元')
ax.set_ylim((0,350))
ax.set_title("婚否的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Marital_Status=df.groupby("Marital_Status").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Marital_Status.Marital_Status=df_Marital_Status.Marital_Status.values.astype('str')
df_Marital_Status['Marital_Status'][0]='未婚'
df_Marital_Status['Marital_Status'][1]='已婚'
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Marital_Status.Purchase,labels=df_Marital_Status.Marital_Status,autopct='%.0f%%',startangle=90)
ax.set_title("婚否的总消费额占比")
plt.show()

df_Product_Category_1=df.groupby("Product_Category_1").Purchase.sum().reset_index()
df_Product_Category_1=df_Product_Category_1.apply(lambda x:x[1]/10000000,axis=1).reset_index().rename({'index':'Product_Category_1',0:"Purchase"},axis=1)
for i in range(18):
df_Product_Category_1.Product_Category_1[i]=i+1
fig,ax=plt.subplots(figsize=(6,6))
df_Product_Category_1.Product_Category_1=df_Product_Category_1.Product_Category_1.values.astype('str')
df_Product_Category_1=df_Product_Category_1.sort_values(by='Purchase',ascending=False)
rects=plt.bar(df_Product_Category_1.Product_Category_1,df_Product_Category_1.Purchase)
ax.set_xlabel("产品类别1")
ax.set_ylabel('千万')
ax.set_ylim((0,220))
ax.set_title("产品类别1的各类总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Product_Category_1=df.groupby("Product_Category_1").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Product_Category_1.Product_Category_1=df_Product_Category_1.Product_Category_1.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Product_Category_1.Purchase,labels=df_Product_Category_1.Product_Category_1,autopct='%.0f%%',startangle=90)
ax.set_title("产品类别1的各类总消费额占比")
plt.show()

df_Product_Category_2=df[df.Product_Category_2>0].groupby("Product_Category_2").Purchase.sum().reset_index()
df_Product_Category_2=df_Product_Category_2.apply(lambda x:x[1]/10000000,axis=1).reset_index().rename({'index':'Product_Category_2',0:"Purchase"},axis=1)
for i in range(17):
df_Product_Category_2.Product_Category_2[i]=i+2
fig,ax=plt.subplots(figsize=(6,6))
df_Product_Category_2.Product_Category_2=df_Product_Category_2.Product_Category_2.values.astype('str')
df_Product_Category_2=df_Product_Category_2.sort_values(by='Purchase',ascending=False)
rects=plt.bar(df_Product_Category_2.Product_Category_2,df_Product_Category_2.Purchase)
ax.set_xlabel("产品类别2")
ax.set_ylabel('千万')
ax.set_ylim((0,80))
ax.set_title("产品类别2的各类总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Product_Category_2=df[df.Product_Category_2>0].groupby("Product_Category_2").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Product_Category_2.Product_Category_2=df_Product_Category_2.Product_Category_2.values.astype('int')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Product_Category_2.Purchase,labels=df_Product_Category_2.Product_Category_2,autopct='%.0f%%',startangle=90)
ax.set_title("产品类别2的各类总消费额占比")
plt.show()

df_Product_Category_3=df[df.Product_Category_3>0].groupby("Product_Category_3").Purchase.sum().reset_index()
df_Product_Category_3=df_Product_Category_3.apply(lambda x:x[1]/10000000,axis=1).reset_index().rename({'index':'Product_Category_3',0:"Purchase"},axis=1)
df_Product_Category_3.Product_Category_3=df_Product_Category_3.Product_Category_3.values.astype('str')
for i in range(15):
if i<=3:
df_Product_Category_3.Product_Category_3[i]=str(i+3)
else:
df_Product_Category_3.Product_Category_3[i]=str(i+4)
df_Product_Category_3=df_Product_Category_3.sort_values(by='Purchase',ascending=False)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(df_Product_Category_3.Product_Category_3,df_Product_Category_3.Purchase)
ax.set_xlabel("产品类别3")
ax.set_ylabel('千万')
ax.set_ylim((0,50))
ax.set_title("产品类别3的各类总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Product_Category_3=df[df.Product_Category_3>0].groupby("Product_Category_3").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Product_Category_3.Product_Category_3=df_Product_Category_3.Product_Category_3.values.astype('int')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Product_Category_3.Purchase,labels=df_Product_Category_3.Product_Category_3,autopct='%.0f%%',startangle=90)
ax.set_title("产品类别2的各类总消费额占比")
plt.show()

df['Gender_Marital_Status_Age']=df.Gender
df.Marital_Status=df.Marital_Status.values.astype(str)
df['Gender_Marital_Status_Age']=df.Gender+'_'+df.Marital_Status+'_'+df.Age
df_Gender_Marital_Status_Age=df.groupby('Gender_Marital_Status_Age').Purchase.sum().reset_index()
df_Gender_Marital_Status_Age_left=df_Gender_Marital_Status_Age.drop(columns='Purchase').reset_index()
df_Gender_Marital_Status_Age=df_Gender_Marital_Status_Age.apply(lambda x:x[1]/100000000,axis=1).reset_index().rename({0:'Purchase'},axis=1)
df_Gender_Marital_Status_Age=pd.merge(left=df_Gender_Marital_Status_Age_left,right=df_Gender_Marital_Status_Age,how="inner",on='index',suffixes=('','_del'))
df_Gender_Marital_Status_Age=df_Gender_Marital_Status_Age.sort_values(by='Purchase',ascending=False)
fig,ax=plt.subplots(figsize=(12,7))
rects=plt.bar(df_Gender_Marital_Status_Age.Gender_Marital_Status_Age,df_Gender_Marital_Status_Age.Purchase)
ax.set_xticklabels(labels=df_Gender_Marital_Status_Age.Gender_Marital_Status_Age,rotation=90)
ax.set_ylim(0,12)
ax.set_title('细分性别-婚姻-年龄总消费额')
ax.set_ylabel('亿元')
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.2f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Gender_Marital_Status_Age=df.groupby("Gender_Marital_Status_Age").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Gender_Marital_Status_Age.Gender_Marital_Status_Age=df_Gender_Marital_Status_Age.Gender_Marital_Status_Age.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Gender_Marital_Status_Age.Purchase,labels=df_Gender_Marital_Status_Age.Gender_Marital_Status_Age,autopct='%.0f%%',startangle=90)
ax.set_title("细分性别-婚姻-年龄总消费额占比")
plt.show()

df['Gender_Marital_Status_Age']=df.Gender
df.Marital_Status=df.Marital_Status.values.astype(str)
df.Occupation=df.Occupation.values.astype(str)
df['Gender_Marital_Status_Age_Occupation']=df.Gender+'_'+df.Marital_Status+'_'+df.Age+'_'+df.Occupation
df_Gender_Marital_Status_Age_Occupation=df.groupby('Gender_Marital_Status_Age_Occupation').Purchase.sum().reset_index()
df_Gender_Marital_Status_Age_Occupation_left=df_Gender_Marital_Status_Age_Occupation.drop(columns='Purchase').reset_index()
df_Gender_Marital_Status_Age_Occupation=df_Gender_Marital_Status_Age_Occupation.apply(lambda x:x[1]/100000000,axis=1).reset_index().rename({0:'Purchase'},axis=1)
df_Gender_Marital_Status_Age_Occupation=pd.merge(left=df_Gender_Marital_Status_Age_Occupation_left,right=df_Gender_Marital_Status_Age_Occupation,how="inner",on='index',suffixes=('','_del'))
df_Gender_Marital_Status_Age_Occupation=df_Gender_Marital_Status_Age_Occupation.sort_values(by='Purchase',ascending=False).head(30)
fig,ax=plt.subplots(figsize=(12,7))
rects=plt.bar(df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation,df_Gender_Marital_Status_Age_Occupation.Purchase)
ax.set_xticklabels(labels=df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation,rotation=90)
ax.set_ylim(0,4)
ax.set_title('细分性别-婚姻-年龄-职业总消费额')
ax.set_ylabel('亿元')
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.1f'%height, ha='center', va='bottom',fontsize=12)
plt.show()

df_Gender_Marital_Status_Age_Occupation=df.groupby("Gender_Marital_Status_Age_Occupation").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation=df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Gender_Marital_Status_Age_Occupation.Purchase,labels=df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation,autopct='%.0f%%',startangle=90)
ax.set_title("细分性别-婚姻-年龄-职业总消费额占比")
plt.show()

df_enjoy_product_1_count=df.groupby('Product_Category_1').Purchase.count().reset_index().sort_values(by='Purchase',ascending=True).rename({'Purchase':'Count'},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1_count.Count,labels=df_enjoy_product_1_count.Product_Category_1,autopct='%.2f%%',startangle=90)
ax.set_title("产品类别1热销品占比")
plt.show()

df_enjoy_product_1_sum=df.groupby('Product_Category_1').Purchase.sum().reset_index().sort_values(by='Purchase',ascending=True)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1_sum.Purchase,labels=df_enjoy_product_1_sum.Product_Category_1,autopct='%.2f%%',startangle=90)
ax.set_title("产品类别1热销品占比")
plt.show()

df_enjoy_product_1_sum_top5=df_enjoy_product_1_sum.sort_values(by='Purchase',ascending=False).head(5)
df_enjoy_product_1_sum_last5=df_enjoy_product_1_sum.sort_values(by='Purchase',ascending=False).tail(5)
df_enjoy_product_1_count_top5=df_enjoy_product_1_count.sort_values(by='Count',ascending=False).head(5)
df_enjoy_product_1_count_last5=df_enjoy_product_1_count.sort_values(by='Count',ascending=False).tail(5)
df_enjoy_product_1=pd.merge(left=df_enjoy_product_1_sum_top5,right=df_enjoy_product_1_count_top5,how="inner",on='Product_Category_1',suffixes=('','_del'))
df_enjoy_product_1['Prep']=df_enjoy_product_1.apply(lambda x:int(x[1]/x[2]),axis=1)
df_enjoy_product_1=df_enjoy_product_1.sort_values(by='Purchase',ascending=False).head(3)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1.Purchase,labels=df_enjoy_product_1.Product_Category_1,autopct='%.f%%',startangle=90)
ax.set_title('产品类别1(高销售额与销量高)的产品销售额占比')
plt.show()

df_enjoy_product_1=df_enjoy_product_1.sort_values(by='Count',ascending=False).head(3)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1.Count,labels=df_enjoy_product_1.Product_Category_1,autopct='%.f%%',startangle=90)
ax.set_title('产品类别1(高销售额与销量高)的产品销售量占比')
plt.show()

df_hate_product_1=pd.merge(left=df_enjoy_product_1_sum_last5,right=df_enjoy_product_1_count_last5,how='inner',on='Product_Category_1',suffixes=('','_del'))
df_hate_product_1['Prep']=df_enjoy_product_1.apply(lambda x:int(x[1]/x[2]),axis=1)
print(df_hate_product_1.sort_values(by='Purchase',ascending=True))

七、完整代码*
#导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#设定绘图风格
plt.style.use('ggplot')
df=pd.read_csv(r'C:\Users\41174\Desktop\BlackFriday.csv')
print(df.head())
df.info()
df.isna().any()
missing_percentage = (df.isnull().sum()/df.shape[0]*100).sort_values(ascending=False)
missing_percentage = missing_percentage[missing_percentage!=0].round(2)
print(missing_percentage)
df.fillna(0,inplace=True) #缺失值使用 0 填充
df.isna().any().sum() #结果没有缺失值
print(df["Purchase"].sum() /100000000)#总的消费金额是50.17亿美元
print(df["Product_ID"].count()/10000) #所销售的产品数量53.76万
print(df["Purchase"].sum()/df["Product_ID"].count()) #平均每个产品的价格是9333美元
print(df["Purchase"].sum()/df["User_ID"].drop_duplicates(keep='first').count()) #平均客单价是85万美元
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
df_Gender=df.groupby("Gender").Purchase.sum().reset_index()
df_Gender=df_Gender.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(('F','M'),df_Gender.Purchase)
ax.set_xlabel("性别")
ax.set_ylabel('亿元')
ax.set_ylim((0,45))
ax.set_title("不同性别的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12,)
plt.show()
df_Gender=df.groupby("Gender").Purchase.sum().reset_index()
df_Gender=df_Gender.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
labels=[u"F",u"M"]
rects=plt.pie(df_Gender.Purchase,labels=labels,autopct='%1.0f%%')
ax.set_title('不同性别消费占比')
plt.show()
df_Age=df.groupby("Age").Purchase.sum().reset_index()
df_Age1=df_Age.drop(['Purchase'],axis=1).reset_index()
df_Age=df_Age.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
df_Age=pd.merge(left=df_Age1,right=df_Age,how="inner",on='index',suffixes=('','_del'))
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(df_Age.Age,df_Age.Purchase)
ax.set_xlabel("年龄")
ax.set_ylabel('亿元')
ax.set_ylim((0,25))
ax.set_title("不同年龄的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12,)
plt.show()
df_Age=df.groupby("Age").Purchase.sum().reset_index()
df_Age1=df_Age.drop(['Purchase'],axis=1).reset_index()
df_Age=df_Age.apply(lambda x:x[1]/100000000,axis=1).reset_index(drop=True).reset_index().rename({0:"Purchase"},axis=1)
df_Age=pd.merge(left=df_Age1,right=df_Age,how="inner",on='index',suffixes=('','_del'))
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.pie(df_Age.Purchase,labels=df_Age.Age,autopct='%.0f%%')
ax.set_title("不同年龄的总消费额占比")
df_Occupation=df.groupby("Occupation").Purchase.sum().reset_index()
df_Occupation=df_Occupation.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'Occupation',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
df_Occupation.Occupation=df_Occupation.Occupation.values.astype('str')
rects=plt.bar(df_Occupation.Occupation,df_Occupation.Purchase)
ax.set_xlabel("职业")
ax.set_ylabel('千万')
ax.set_ylim((0,80))
ax.set_title("不同职业的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Occupation=df.groupby("Occupation").Purchase.sum().reset_index().sort_values(by="Purchase")
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.pie(df_Occupation.Purchase,labels=df_Occupation.Occupation,autopct='%.0f%%',startangle=90)
ax.set_title("不同职业的总消费额占比")
df_City_Category=df.groupby("City_Category").Purchase.sum().reset_index()
df_City_Category=df_City_Category.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'City_Category',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(("A","B","C"),df_City_Category.Purchase)
ax.set_xlabel("城市")
ax.set_ylabel('亿元')
ax.set_ylim((0,250))
ax.set_title("不同城市的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_City_Category=df.groupby("City_Category").Purchase.sum().reset_index().sort_values(by="Purchase")
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.pie(df_City_Category.Purchase,labels=df_City_Category.City_Category,autopct='%.0f%%')
ax.set_title("不同城市的总消费额占比")
df_Stay_In_Current_City_Years=df.groupby("Stay_In_Current_City_Years").Purchase.sum().reset_index()
df_Stay_In_Current_City_Years=df_Stay_In_Current_City_Years.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'Stay_In_Current_City_Years',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(df_Stay_In_Current_City_Years.Stay_In_Current_City_Years,df_Stay_In_Current_City_Years.Purchase)
ax.set_xlabel("年限")
ax.set_ylabel('亿元')
ax.set_ylim((0,200))
ax.set_title("不同年限的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Stay_In_Current_City_Years=df.groupby("Stay_In_Current_City_Years").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Stay_In_Current_City_Years.Stay_In_Current_City_Years=df_Stay_In_Current_City_Years.Stay_In_Current_City_Years.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Stay_In_Current_City_Years.Purchase,labels=df_Stay_In_Current_City_Years.Stay_In_Current_City_Years,autopct='%.0f%%',startangle=90)
ax.set_title("不同年限的总消费额占比")
plt.show()
df_User_ID_count=df.groupby('User_ID').Purchase.count().reset_index().rename({'Purchase':'Count'},axis=1).sort_values(by='Count',ascending=False).head(10)
df_User_ID_sum=df.groupby('User_ID').Purchase.sum().reset_index().sort_values(by='Purchase',ascending=False).head(10)
df_User_ID=pd.merge(left=df_User_ID_sum,right=df_User_ID_count,how='inner',on='User_ID',suffixes=('','_del'))
df_User_ID['Prep']=df_User_ID.apply(lambda x:int(x[1]/x[2]),axis=1)
print(df_User_ID)
df_Marital_Status=df.groupby("Marital_Status").Purchase.sum().reset_index()
df_Marital_Status=df_Marital_Status.apply(lambda x:x[1]/10000000,axis=1).reset_index(drop=True).reset_index().rename({'index':'Marital_Status',0:"Purchase"},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
df_Marital_Status.Marital_Status=df_Marital_Status.Marital_Status.values.astype('str')
rects=plt.bar(df_Marital_Status.Marital_Status,df_Marital_Status.Purchase)
ax.set_xlabel("婚否")
ax.set_ylabel('亿元')
ax.set_ylim((0,350))
ax.set_title("婚否的总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Marital_Status=df.groupby("Marital_Status").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Marital_Status.Marital_Status=df_Marital_Status.Marital_Status.values.astype('str')
df_Marital_Status['Marital_Status'][0]='未婚'
df_Marital_Status['Marital_Status'][1]='已婚'
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Marital_Status.Purchase,labels=df_Marital_Status.Marital_Status,autopct='%.0f%%',startangle=90)
ax.set_title("婚否的总消费额占比")
plt.show()
df_Product_Category_1=df.groupby("Product_Category_1").Purchase.sum().reset_index()
df_Product_Category_1=df_Product_Category_1.apply(lambda x:x[1]/10000000,axis=1).reset_index().rename({'index':'Product_Category_1',0:"Purchase"},axis=1)
for i in range(18):
df_Product_Category_1.Product_Category_1[i]=i+1
fig,ax=plt.subplots(figsize=(6,6))
df_Product_Category_1.Product_Category_1=df_Product_Category_1.Product_Category_1.values.astype('str')
df_Product_Category_1=df_Product_Category_1.sort_values(by='Purchase',ascending=False)
rects=plt.bar(df_Product_Category_1.Product_Category_1,df_Product_Category_1.Purchase)
ax.set_xlabel("产品类别1")
ax.set_ylabel('千万')
ax.set_ylim((0,220))
ax.set_title("产品类别1的各类总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Product_Category_1=df.groupby("Product_Category_1").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Product_Category_1.Product_Category_1=df_Product_Category_1.Product_Category_1.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Product_Category_1.Purchase,labels=df_Product_Category_1.Product_Category_1,autopct='%.0f%%',startangle=90)
ax.set_title("产品类别1的各类总消费额占比")
plt.show()
df_Product_Category_2=df[df.Product_Category_2>0].groupby("Product_Category_2").Purchase.sum().reset_index()
df_Product_Category_2=df_Product_Category_2.apply(lambda x:x[1]/10000000,axis=1).reset_index().rename({'index':'Product_Category_2',0:"Purchase"},axis=1)
for i in range(17):
df_Product_Category_2.Product_Category_2[i]=i+2
fig,ax=plt.subplots(figsize=(6,6))
df_Product_Category_2.Product_Category_2=df_Product_Category_2.Product_Category_2.values.astype('str')
df_Product_Category_2=df_Product_Category_2.sort_values(by='Purchase',ascending=False)
rects=plt.bar(df_Product_Category_2.Product_Category_2,df_Product_Category_2.Purchase)
ax.set_xlabel("产品类别2")
ax.set_ylabel('千万')
ax.set_ylim((0,80))
ax.set_title("产品类别2的各类总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Product_Category_2=df[df.Product_Category_2>0].groupby("Product_Category_2").Purchase.sum().reset_index().sort_values(by="Purchase")
print(df_Product_Category_2)
df_Product_Category_2.Product_Category_2=df_Product_Category_2.Product_Category_2.values.astype('int')
print(df_Product_Category_2)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Product_Category_2.Purchase,labels=df_Product_Category_2.Product_Category_2,autopct='%.0f%%',startangle=90)
ax.set_title("产品类别2的各类总消费额占比")
plt.show()
df_Product_Category_2=df[df.Product_Category_2>0].groupby("Product_Category_2").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Product_Category_2.Product_Category_2=df_Product_Category_2.Product_Category_2.values.astype('int')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Product_Category_2.Purchase,labels=df_Product_Category_2.Product_Category_2,autopct='%.0f%%',startangle=90)
ax.set_title("产品类别2的各类总消费额占比")
plt.show()
df_Product_Category_3=df[df.Product_Category_3>0].groupby("Product_Category_3").Purchase.sum().reset_index()
df_Product_Category_3=df_Product_Category_3.apply(lambda x:x[1]/10000000,axis=1).reset_index().rename({'index':'Product_Category_3',0:"Purchase"},axis=1)
df_Product_Category_3.Product_Category_3=df_Product_Category_3.Product_Category_3.values.astype('str')
for i in range(15):
if i<=3:
df_Product_Category_3.Product_Category_3[i]=str(i+3)
else:
df_Product_Category_3.Product_Category_3[i]=str(i+4)
df_Product_Category_3=df_Product_Category_3.sort_values(by='Purchase',ascending=False)
fig,ax=plt.subplots(figsize=(6,6))
rects=plt.bar(df_Product_Category_3.Product_Category_3,df_Product_Category_3.Purchase)
ax.set_xlabel("产品类别3")
ax.set_ylabel('千万')
ax.set_ylim((0,50))
ax.set_title("产品类别3的各类总消费额")
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.0f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Product_Category_3=df[df.Product_Category_3>0].groupby("Product_Category_3").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Product_Category_3.Product_Category_3=df_Product_Category_3.Product_Category_3.values.astype('int')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Product_Category_3.Purchase,labels=df_Product_Category_3.Product_Category_3,autopct='%.0f%%',startangle=90)
ax.set_title("产品类别2的各类总消费额占比")
plt.show()
df['Gender_Marital_Status_Age']=df.Gender
df.Marital_Status=df.Marital_Status.values.astype(str)
df['Gender_Marital_Status_Age']=df.Gender+'_'+df.Marital_Status+'_'+df.Age
df_Gender_Marital_Status_Age=df.groupby('Gender_Marital_Status_Age').Purchase.sum().reset_index()
df_Gender_Marital_Status_Age_left=df_Gender_Marital_Status_Age.drop(columns='Purchase').reset_index()
df_Gender_Marital_Status_Age=df_Gender_Marital_Status_Age.apply(lambda x:x[1]/100000000,axis=1).reset_index().rename({0:'Purchase'},axis=1)
df_Gender_Marital_Status_Age=pd.merge(left=df_Gender_Marital_Status_Age_left,right=df_Gender_Marital_Status_Age,how="inner",on='index',suffixes=('','_del'))
df_Gender_Marital_Status_Age=df_Gender_Marital_Status_Age.sort_values(by='Purchase',ascending=False)
fig,ax=plt.subplots(figsize=(12,7))
rects=plt.bar(df_Gender_Marital_Status_Age.Gender_Marital_Status_Age,df_Gender_Marital_Status_Age.Purchase)
ax.set_xticklabels(labels=df_Gender_Marital_Status_Age.Gender_Marital_Status_Age,rotation=90)
ax.set_ylim(0,12)
ax.set_title('细分性别-婚姻-年龄总消费额')
ax.set_ylabel('亿元')
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.2f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Gender_Marital_Status_Age=df.groupby("Gender_Marital_Status_Age").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Gender_Marital_Status_Age.Gender_Marital_Status_Age=df_Gender_Marital_Status_Age.Gender_Marital_Status_Age.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Gender_Marital_Status_Age.Purchase,labels=df_Gender_Marital_Status_Age.Gender_Marital_Status_Age,autopct='%.0f%%',startangle=90)
ax.set_title("细分性别-婚姻-年龄总消费额占比")
plt.show()
df['Gender_Marital_Status_Age']=df.Gender
df.Marital_Status=df.Marital_Status.values.astype(str)
df.Occupation=df.Occupation.values.astype(str)
df['Gender_Marital_Status_Age_Occupation']=df.Gender+'_'+df.Marital_Status+'_'+df.Age+'_'+df.Occupation
df_Gender_Marital_Status_Age_Occupation=df.groupby('Gender_Marital_Status_Age_Occupation').Purchase.sum().reset_index()
df_Gender_Marital_Status_Age_Occupation_left=df_Gender_Marital_Status_Age_Occupation.drop(columns='Purchase').reset_index()
df_Gender_Marital_Status_Age_Occupation=df_Gender_Marital_Status_Age_Occupation.apply(lambda x:x[1]/100000000,axis=1).reset_index().rename({0:'Purchase'},axis=1)
df_Gender_Marital_Status_Age_Occupation=pd.merge(left=df_Gender_Marital_Status_Age_Occupation_left,right=df_Gender_Marital_Status_Age_Occupation,how="inner",on='index',suffixes=('','_del'))
df_Gender_Marital_Status_Age_Occupation=df_Gender_Marital_Status_Age_Occupation.sort_values(by='Purchase',ascending=False).head(30)
fig,ax=plt.subplots(figsize=(12,7))
rects=plt.bar(df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation,df_Gender_Marital_Status_Age_Occupation.Purchase)
ax.set_xticklabels(labels=df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation,rotation=90)
ax.set_ylim(0,4)
ax.set_title('细分性别-婚姻-年龄-职业总消费额')
ax.set_ylabel('亿元')
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width() / 2, height+0.2, '%.1f'%height, ha='center', va='bottom',fontsize=12)
plt.show()
df_Gender_Marital_Status_Age_Occupation=df.groupby("Gender_Marital_Status_Age_Occupation").Purchase.sum().reset_index().sort_values(by="Purchase")
df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation=df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation.values.astype('str')
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_Gender_Marital_Status_Age_Occupation.Purchase,labels=df_Gender_Marital_Status_Age_Occupation.Gender_Marital_Status_Age_Occupation,autopct='%.0f%%',startangle=90)
ax.set_title("细分性别-婚姻-年龄-职业总消费额占比")
plt.show()
df_enjoy_product_1_count=df.groupby('Product_Category_1').Purchase.count().reset_index().sort_values(by='Purchase',ascending=True).rename({'Purchase':'Count'},axis=1)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1_count.Count,labels=df_enjoy_product_1_count.Product_Category_1,autopct='%.2f%%',startangle=90)
ax.set_title("产品类别1热销品占比")
plt.show()
df_enjoy_product_1_sum=df.groupby('Product_Category_1').Purchase.sum().reset_index().sort_values(by='Purchase',ascending=True)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1_sum.Purchase,labels=df_enjoy_product_1_sum.Product_Category_1,autopct='%.2f%%',startangle=90)
ax.set_title("产品类别1热销品占比")
plt.show()
df_enjoy_product_1_sum_top5=df_enjoy_product_1_sum.sort_values(by='Purchase',ascending=False).head(5)
df_enjoy_product_1_sum_last5=df_enjoy_product_1_sum.sort_values(by='Purchase',ascending=False).tail(5)
df_enjoy_product_1_count_top5=df_enjoy_product_1_count.sort_values(by='Count',ascending=False).head(5)
df_enjoy_product_1_count_last5=df_enjoy_product_1_count.sort_values(by='Count',ascending=False).tail(5)
df_enjoy_product_1=pd.merge(left=df_enjoy_product_1_sum_top5,right=df_enjoy_product_1_count_top5,how="inner",on='Product_Category_1',suffixes=('','_del'))
df_enjoy_product_1['Prep']=df_enjoy_product_1.apply(lambda x:int(x[1]/x[2]),axis=1)
df_enjoy_product_1=df_enjoy_product_1.sort_values(by='Purchase',ascending=False).head(3)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1.Purchase,labels=df_enjoy_product_1.Product_Category_1,autopct='%.f%%',startangle=90)
ax.set_title('产品类别1(高销售额与销量高)的产品销售额占比')
plt.show()
df_enjoy_product_1=df_enjoy_product_1.sort_values(by='Count',ascending=False).head(3)
fig,ax=plt.subplots(figsize=(6,6))
plt.pie(df_enjoy_product_1.Count,labels=df_enjoy_product_1.Product_Category_1,autopct='%.f%%',startangle=90)
ax.set_title('产品类别1(高销售额与销量高)的产品销售量占比')
plt.show()
df_hate_product_1=pd.merge(left=df_enjoy_product_1_sum_last5,right=df_enjoy_product_1_count_last5,how='inner',on='Product_Category_1',suffixes=('','_del'))
df_hate_product_1['Prep']=df_enjoy_product_1.apply(lambda x:int(x[1]/x[2]),axis=1)
print(df_hate_product_1.sort_values(by='Purchase',ascending=True))
八、备注
若有错误,还望指出,我会及时更新,谢谢!
网友评论