airline

作者: Christa_257b | 来源:发表于2020-03-17 00:27 被阅读0次

    import pandas as pd

    import numpy as np

    import matplotlib.pyplot as plt

    import seaborn as sns

    %matplotlib inline

    # Use the SimHei font so the Chinese labels used in later plots can be
    # drawn, and disable the Unicode minus glyph for axis ticks.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    # Load the raw member table and print its per-column summary.
    df = pd.read_csv('C:/Users/xh/Desktop/air_data.csv')
    df.info()

    # Print "<column> <missing count>" for every column that has at least one
    # missing value; fully populated columns are skipped.
    columns = df.columns
    for col in columns:
        missing = df[col].isnull().sum()
        if missing != 0:
            print(col, missing)

    # For Beijing and Shanghai the province and city fields hold the same
    # value, so each field is used to back-fill the other.
    municipalities = ('北京', '上海')

    # Pass 1: province known -> set the city.
    for name in municipalities:
        df.loc[df['WORK_PROVINCE'] == name, 'WORK_CITY'] = name

    # Pass 2: city known -> set the province.
    # Fixed: the original assigned to the misspelled label 'WORK_PROCINCE',
    # which appended a new column instead of filling WORK_PROVINCE.
    for name in municipalities:
        df.loc[df['WORK_CITY'] == name, 'WORK_PROVINCE'] = name

    # Notebook-style inspection: how many rows still have no city.
    df['WORK_CITY'].isnull().sum()

    # NOTE(review): the original followed this with `df = df.iloc[:, :-1]`,
    # presumably to discard the stray misspelled column. With the label fixed
    # no stray column exists, so nothing is dropped here (dropping the last
    # column now would remove a real column from the CSV).
    df.describe()

    # Parse the FFP_DATE column and derive a month-resolution MONTH column
    # from it.
    df['FFP_DATE'] = pd.to_datetime(df['FFP_DATE'])
    ffp_dates = df['FFP_DATE'].values
    df['MONTH'] = ffp_dates.astype('datetime64[M]')

    # Bucket AGE into labelled bands; pd.cut uses right-closed intervals, so
    # the first band is (0, 10] and the last is (60, 110].
    bins = [0, 10, 20, 30, 40, 50, 60, 110]
    age_labels = ['10岁以下', '10-20', '20-30', '30-40', '40-50', '50-60', '60+']
    df['AGE_CUT'] = pd.cut(df['AGE'], bins=bins, labels=age_labels)

    # 2x2 overview of member counts: by MONTH (line), then by GENDER,
    # AGE_CUT and FFP_TIER (bars).
    plt.figure(figsize=(12, 8))
    panels = (
        (221, 'MONTH', {}),
        (222, 'GENDER', {'kind': 'bar'}),
        (223, 'AGE_CUT', {'kind': 'bar'}),
        (224, 'FFP_TIER', {'kind': 'bar'}),
    )
    for position, group_key, plot_kwargs in panels:
        plt.subplot(position)
        df.groupby(group_key)['MEMBER_NO'].count().plot(**plot_kwargs)

    # Share of members (with a known country) whose WORK_COUNTRY is 'CN';
    # a bare expression, shown only when run as a notebook cell.
    (df['WORK_COUNTRY'] == 'CN').sum() / df['WORK_COUNTRY'].count()

    # Keep the member id, tier and the two SUM_YR columns, dropping any row
    # that is missing one of them.
    member = df[['MEMBER_NO', 'FFP_TIER', 'SUM_YR_1', 'SUM_YR_2']].dropna()

    # One scatter grid per SUM_YR column, faceted by tier (one panel per
    # FFP_TIER value). `g` ends up bound to the SUM_YR_2 grid.
    for fare_column in ('SUM_YR_1', 'SUM_YR_2'):
        g = sns.FacetGrid(member, col='FFP_TIER')
        g.map(plt.scatter, 'MEMBER_NO', fare_column)

    # Histogram of positive SUM_YR_2 values, one panel per tier (4, 5, 6).
    plt.figure(figsize=(5, 5))
    for position, tier in zip((131, 132, 133), (4, 5, 6)):
        plt.subplot(position)
        # Fixed: the original combined the two conditions with `|`, which
        # selected every row with SUM_YR_2 > 0 regardless of tier, so the
        # three panels were nearly identical. Both conditions must hold.
        in_tier_with_fare = (member['FFP_TIER'] == tier) & (member['SUM_YR_2'] > 0)
        member.loc[in_tier_with_fare, 'SUM_YR_2'].hist(bins=50)

    # Per-tier mean of FLIGHT_COUNT and of AVG_INTERVAL (bare expressions,
    # shown only when run as notebook cells).
    tier_groups = df.groupby('FFP_TIER')
    tier_groups['FLIGHT_COUNT'].mean()
    tier_groups['AVG_INTERVAL'].mean()

    # Overall mean AVG_INTERVAL over rows with SUM > 0.
    # Fixed: the original read `data1['AVG_INTERVAL']`, but `data1` is only
    # created further down the script, so a top-to-bottom run raised
    # NameError. The same filter (SUM present and > 0) is applied inline here.
    df.loc[df['SUM'] > 0, 'AVG_INTERVAL'].mean()

    # Tiers ordered from most to least frequent.
    ffp = member['FFP_TIER'].value_counts().index

    # Fraction of members belonging to each tier.
    member_total = member['FFP_TIER'].count()
    for tier in ffp:
        tier_mask = member['FFP_TIER'] == tier
        print(tier_mask.sum() / member_total)

    # Fraction of total SUM_YR_2 contributed by each tier.
    fare_total = member['SUM_YR_2'].sum()
    for tier in ffp:
        tier_mask = member['FFP_TIER'] == tier
        print(member.loc[tier_mask, 'SUM_YR_2'].sum() / fare_total)

    # Total SUM_YR_2 per member, sorted ascending, with a running total.
    memb = (
        member.groupby('MEMBER_NO')['SUM_YR_2']
        .sum()
        .sort_values()
        .reset_index()
    )
    memb['CUMSUM'] = memb['SUM_YR_2'].cumsum()
    memb.tail()

    # Normalise the running total by its maximum to get the cumulative share.
    # Uses the vectorized Series.max() instead of the built-in max(), which
    # iterates over the Series element by element in Python.
    total = memb['CUMSUM'].max()
    memb['P'] = memb['CUMSUM'] / total
    memb['P'].plot()

    # Lower-triangle correlation heatmap over the fully populated rows.
    sns.set_style('darkgrid')
    plt.figure(figsize=(25, 15))

    data = df.dropna()
    column = data.columns.tolist()

    # Restrict to numeric/bool columns explicitly: DataFrame.corr() raises on
    # string or datetime columns in pandas >= 2.0 instead of dropping them.
    numeric = data[column].select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()

    # Mask the upper triangle (diagonal included) so each pair is drawn once.
    # Fixed: `np.bool` was removed in NumPy 1.24; the built-in `bool` is the
    # supported spelling.
    zero = np.triu(np.ones(corr.shape, dtype=bool))

    sns.heatmap(corr, mask=zero, square=True, annot=True, fmt='.2f')

    # Keep only rows with a recorded, strictly positive SUM.
    data1 = df.dropna(subset=['SUM'])
    data1 = data1[data1['SUM'] > 0]

    # Take an explicit copy so the column assignments below modify this frame
    # rather than a view of `data1` (avoids SettingWithCopyWarning).
    data = data1[['LOAD_TIME', 'FFP_DATE', 'LAST_TO_END', 'FLIGHT_COUNT',
                  'SEG_KM_SUM', 'avg_discount']].copy()
    data['LOAD_TIME'] = pd.to_datetime(data['LOAD_TIME'])

    # L: time between FFP_DATE and LOAD_TIME, expressed in months.
    # Fixed: dividing by np.timedelta64(1, 'M') is rejected by pandas >= 2.0
    # because 'M' is not a fixed-length unit. Older pandas treated it as
    # 365.2425 / 12 = 30.436875 days, so that length is used explicitly.
    one_month = pd.Timedelta(days=30.436875)
    data['L'] = (data['LOAD_TIME'] - data['FFP_DATE']) / one_month
    # R, F, M, C are copies of existing columns under the LRFMC names.
    data['R'] = data['LAST_TO_END']
    data['F'] = data['FLIGHT_COUNT']
    data['M'] = data['SEG_KM_SUM']
    data['C'] = data['avg_discount']

    # Select the five features by name rather than by position (iloc[:, 6:]).
    da = data[['L', 'R', 'F', 'M', 'C']].copy()
    # Round to 2 decimals first, then truncate to whole months (as before).
    da['L'] = da['L'].round(2).astype('int')

    # Z-score standardisation per column, then hand a plain array to sklearn.
    da = (da - da.mean(axis=0)) / da.std(axis=0)
    da = np.array(da)

    from sklearn.cluster import KMeans

    # Cluster the standardised LRFMC matrix into k groups.
    k = 5
    kmean = KMeans(n_clusters=k)
    kmean.fit(da)

    kmean.cluster_centers_  # inspect the cluster centres (notebook display)
    print(kmean.labels_)

    # Number of rows per cluster, indexed by cluster id.
    # Fixed: the original called value_counts twice (counting the counts),
    # passed the non-existent keyword `colors=`, and referenced `color` before
    # it was defined. pd.value_counts is also deprecated in favour of
    # Series.value_counts.
    label = pd.Series(kmean.labels_).value_counts()
    color = ['r', 'g', 'b', 'c', 'y']
    # Sort by cluster id so that bar i is drawn with color[i].
    label.sort_index().plot(kind='bar', color=color)

    # Line plot of each cluster centre across the five features; the legend
    # entry shows the cluster id followed by the value of label[id].
    k = 5
    center = kmean.cluster_centers_
    color = ['r', 'g', 'b', 'c', 'y']
    x = [1, 2, 3, 4, 5]

    for idx, (centroid, line_color) in enumerate(zip(center, color)):
        legend_text = f'cluster{idx}  {label[idx]}'
        plt.plot(x, centroid, label=legend_text, color=line_color, marker='o')

    plt.xlabel('LRFMC')
    plt.legend()
    plt.show()

    相关文章

      网友评论

          本文标题:airline

          本文链接:https://www.haomeiwen.com/subject/itvaehtx.html