Datawhale组队学习之pandas期中

作者: YANJINING | 来源:发表于2021-01-01 22:25 被阅读0次

Datawhale组队学习之pandas期中
DataWhale组队学习之pandas基础
DataWhale组队学习之pandas预备知识：numpy,p
DataWhale组队学习之索引
Datawhale组队学习之连接
数据挖掘（异常检测）——线性方法
datawhale（2019暑期）组队学习资源汇总
数据挖掘（异常检测）——统计学方法之HBOS
数据挖掘（异常检测）——概述
DataWhale组队学习之缺失数据

参考瑞楠大佬的博客：https://blog.csdn.net/mintminty/article/details/111993371
练习题先自己思考了一下，有的题实在没想出来，看了一下大佬的博客，写的确实好，自愧不如！

image.png

import pandas as pd
import numpy as np
from copy import deepcopy

df1 = pd.read_csv('data/Company.csv')
df2 = pd.read_csv('data/Company_data.csv')
df2.columns = ['id', 'datetime', 'type', 'money']
print(df1.head())
print(df2.head())
print(df2.columns)
df2['datetime'] = pd.to_datetime(df2['datetime'])
print(df2.info())
df2['year'] = df2['datetime'].dt.year
print(df2.dtypes)
df2 = df2.sort_values(by='year')

# pd.set_option('max_rows', None)
df3 = df2.groupby(by=['year', 'type'])['money'] \
    .apply(lambda x: np.sum(x)).reset_index()
print(df3)
year_2014 = df3[df3['year'] == 2014]
year_2015 = df3[df3['year'] == 2015]
year_2016 = df3[df3['year'] == 2016]
year_2017 = df3[df3['year'] == 2017]
I_2014 = -sum(year_2014['money'] / sum(year_2014['money']) \
              * np.log(year_2014['money'] / sum(year_2014['money'])))
I_2015 = -sum(year_2015['money'] / sum(year_2015['money']) \
              * np.log(year_2015['money'] / sum(year_2015['money'])))
I_2016 = -sum(year_2016['money'] / sum(year_2016['money']) \
              * np.log(year_2016['money'] / sum(year_2016['money'])))
I_2017 = -sum(year_2017['money'] / sum(year_2017['money']) \
              * np.log(year_2017['money'] / sum(year_2017['money'])))
df1.columns = ['id', 'year']

print(df1)
df1['I'] = 0
df1 = deepcopy(df1)
df1_2014 = df1[df1['year'] == 2014]
df1_2014 = deepcopy(df1_2014)
df1_2014.loc[:, 'I'] = I_2014
print(df1_2014)
df1_2015 = df1[df1['year'] == 2015]
df1_2015 = deepcopy(df1_2015)
df1_2015.loc[:, 'I'] = I_2015

df1_2016 = df1[df1['year'] == 2016]
df1_2016 = deepcopy(df1_2016)
df1_2016.loc[:, 'I'] = I_2016
df1_2017 = df1[df1['year'] == 2017]
df1_2017 = deepcopy(df1_2017)
df1_2017.loc[:, 'I'] = I_2017

df = df1_2014.append([df1_2015, df1_2016, df1_2017])
df.index = range(df.shape[0])
print(df)

image.png

import pandas as pd
import numpy as np

df = pd.read_excel('data/组队信息汇总表（Pandas）.xlsx', engine='openpyxl')
df = df.drop(['所在群'], axis=1)
pd.set_option('max_rows', None)
pd.set_option('max_columns', 60)
print(df.head())
df_group = df.groupby('队长编号', group_keys=True)[['队伍名称', '队长_群昵称', '队员_群昵称']] \
    .apply(lambda x: x).dropna().reset_index(drop=True)
print(df_group)
# for name, group in df_group:
#     print(name)
#     print(group.dropna(axis=1))

想了半天没想出来，希望用分组的方法依次处理，但是代码没有按照预期实现，反复试验还是没有成功，明天再接着研究，下面是瑞楠大佬的解题思路，写的很棒

data = pd.read_csv('data/组队信息汇总表.csv')
data

filename = open('data/组队信息汇总表.csv')
data = pd.read_csv(filename,encoding='utf-8')
data = data.drop('所在群',1)
a = [['编号_'+str(x), '昵称_'+str(x)] for x in range(11)] 
columns_name = sum(a, []) 
columns_name = ['队伍名称'] + columns_name
data.columns = columns_name
data.head()
res = pd.wide_to_long(data.reset_index(), stubnames=['昵称','编号'], i='队伍名称',j='成员',sep='_',suffix='.+').dropna().reset_index()
res['成员'] = res.成员.values + 1
res.loc[res['成员']!=1,'成员'] = 0
res = res.sort_values(['队伍名称','成员'],ascending=False).reset_index()
res = res.drop(['level_0','index'],1)
res.rename(columns={'成员':'是否队长'})

image.png

president_county_candidate = pd.read_csv('data/president_county_candidate.csv')
president_county_candidate['得票率'] = president_county_candidate.groupby(['state','county'])['total_votes'].apply(lambda x: x/x.sum())
president_county_candidate.sort_values('state')
res = president_county_candidate.pivot(index=['state','county'],columns='candidate',values='得票率')
res['BT'] = res['Joe Biden']-res['Donald Trump']
res = res.reset_index()
res = res.groupby(['state'])['BT'].apply(lambda x:x.median()>0).to_frame()
res[res.BT==True].index
Index(['California', 'Connecticut', 'Delaware', 'District of Columbia',
       'Hawaii', 'Massachusetts', 'New Jersey', 'Rhode Island', 'Vermont'],
      dtype='object', name='state')