Python Data Anlysis Notebook
SublimeText File
Data Frame
-
read file
pd.read_csv(path,index_col=0,chunksize=1000) np.shape()#读取矩阵维度 pd.DataFrame() df[['var','var']] #-->dataframe df[1:4] df.head() #无数据默认前五条 #how to refer col and row df.loc[label] df.loc[[row],[col]] #-->dataframe df.iloc[index]
-
iterating over data
result=[] total=0 for chunck in pd.read_csv(path,chunksize=1000): result.append(sum(chunck['xcolumn'])) #另一种方法 total+=sum(chunck['x']) total=sum(result)
-
NumPy
np.logical_and(x,y) np.logical_or() np.logical_not()
-
Loop
enumerate(iterable) #--> index of the value for index,value in enumerate(iterable,start=10): print(index,value) loop over a dictionary: dic={} for key,value in dic.items(): print(key,value) import numpy as np array1=np.array() array2=np.array() array_total=np.array([array1,array2]) for val in np.nditer(array_total): print(val) import pandas as pd df=pd.read.csv() for label,row in df.iterrows(): print(label) print (row) add a new column in to DataFrame for lab,row in df.iterrows(): df.loc[lab,'name_length']=len(df['country']) print(df) #更简单的方法是使用apply function df['name_length']=df['country'].apply(len) apply(str.upper) pseudo-random import numpy as np np.random.seed(123) coin=np.random.randint(0,2)
''' 函数将一个数据集合(链表,元组等)中的所有数据进行下列操作: 用传给 reduce 中的函数 function(有两个参数)先对集合中的第 1、2 个元素进行操作, 得到的结果再与第三个数据用 function 函数运算,最后得到一个结果。 ''' from functools import reduce reduce(function,iterable) ''' map() 会根据提供的函数对指定序列做映射。 第一个参数 function 以参数序列中的每一个元素调用 function 函数, 返回包含每次 function 函数返回值的新列表。 ''' map(function,iterable,...) ''' filter() 函数用于过滤序列,过滤掉不符合条件的元素,返回由符合条件元素组成的新列表。 该接收两个参数,第一个为函数,第二个为序列, 序列的每个元素作为参数传递给函数进行判,然后返回 True 或 False,最后将返回 True 的元素放到新列表中。 ''' filter(function,iterable)
def func(): try: return function except [TypeError]: return print() # 抛出异常 raise ValueError
Iterators
#iterating over iterables
word='DA'
it=iter(word)
next(it)
print(*it) #->once per time
zip() #-> return an iterator of tuples
list(zip(lst1,lst2))
print(*zipvarable)
-
List comprehensions
lst=[1,2,3,4] lst_add_1=[num+1 for num in lst] lst=[num for num in range(11)] conditionals in comprehensions [num**2 for num in range(10) if num%2==0] dictionary comprehensions {num:-num for num in range(10)} generator->analogous generator object||iterator (num**2 for num in range(10) if num%2==0) generator function->yield def num_sequence(n): i=0 while i<n: yield i entryi+=1
Importing Data in python
file_name=open('file.txt',mode='r') #only to read
#'w' is to write
text=file_name.read()
file_name.close()
with open('datacamp.csv','r') as datacamp #context manager
file_objeect.readline()
! ls #will display the contents of your current directory.
-
flat files (contrain record,row of fields or attributes, column is feature or attributes)
#can have a header #numpy:storing numerical data import numpy as np filename='MNIST.txt' data=np.loadtxt(filename,delimiter=',',skiprows=1,usecols=[0,2],dtype=str) #跳过第一行 #只要第一和三列数据 #import with different type of data np.genfromtext(filename,delimiter=',',names=True,dtype=None) import pandas as pd filename='winequality-red.csv' data=pd.read_csv(filename,nrows=5,header=None)#first 5 rows of the file,there is no header in this file #convert dataframe to a numpy array data_array=data.values
-
Other files
data=pd.ExcelFile(filename) print(data.sheet_names)#figure sheet name out df1=data.parse('sheetname')#sheet name df2=data.parse(0,skiprows=[0],parse_cols[0],names=['Country'])#sheet index #sas file import pandas as pd from sas7bdat import SAS7DBAT with sas7bdat('file.sas7bdat') as file: df_sas=file.to_data_drame() #stata file_name import pandas as pd data=pd.read_stata('file.dta') #HDF5 file import h5py file_name='file.hdf5' data=h5py.File(file_name,'r') for key in data.keys(): print(key) #matlab file import scipy.io scipy.io.loadmat() #- read.mat files scipy.io.savemet() #- write.mat files
-
Relational Database
# Open engine in context manager # Perform query and save results to DataFrame: df engine = create_engine('sqlite:///Chinook.sqlite') with engine.connect() as con: rs = con.execute("SELECT LastName, Title FROM Employee") df = pd.DataFrame(rs.fetchmany(size=3)) df.columns = rs.keys() # Print the length of the DataFrame df print(len(df)) # Print the head of the DataFrame df print(df.head()) # Import packages from sqlalchemy import create_engine import pandas as pd # Create engine: engine engine = create_engine('sqlite:///Chinook.sqlite') # Execute query and store records in DataFrame: df df = pd.read_sql_query("SELECT * From Album", engine) # Print head of DataFrame print(df.head())
enumerate #返回数据和其位置
test=[1,2,3,4,5]
for i, num in enumerate(test):
print (i,m)
Clean Data
1. Sublime笔记整理
# Print the head of df
print(df.head())
# Print the tail of df
print(df.tail())
# Print the shape of df
print(df.shape)
# Print the columns of df
print(df.columns)
# Rename the columns
gapminder_melt.columns = ['country', 'year', 'life_expectancy']
# Print the info of df,provides important information about a DataFrame
print(df.info())
# Print the value_counts for 'State'
print(df.State.value_counts(dropna=False))
# Print the value counts for 'Site Fill'
print(df['Site Fill'].value_counts(dropna=False))
-
Transpose
# Melt airquality: airquality_melt,The id_vars represent the columns of the data you do not want to melt (i.e., keep it in its current shape), # while the value_vars represent the columns you do wish to melt into rows. airquality_melt = pd.melt(frame=df,id_vars=['Month','Day'], value_vars=['Ozone','Solar.R','Wind','Temp'],var_name='measurement', value_name='reading')
-
Pivot_Table
#(非重复数据) # Pivot airquality_melt: airquality_pivot airquality_pivot = airquality_melt.pivot_table(index=['Month','Day'], columns='measurement', values='reading') #(重复数据) # Pivot table the airquality_dup: airquality_pivot airquality_pivot = airquality_dup.pivot_table(index=['Month', 'Day'], columns='measurement', values='reading', aggfunc=np.mean)
# Print the index of airquality_pivot print(airquality_pivot.index) # Reset the index of airquality_pivot: airquality_pivot_reset airquality_pivot_reset = airquality_pivot.reset_index()
-
Melting and Parsing
# Melt tb: tb_melt tb_melt = pd.melt(frame=tb,id_vars=['country', 'year']) # Create the 'gender' column tb_melt['gender'] = tb_melt.variable.str[0] # Melt ebola: ebola_melt ebola_melt = pd.melt(ebola, id_vars=['Date', 'Day'], var_name='type_country', value_name='counts') # Create the 'str_split' column ebola_melt['str_split'] = ebola_melt.type_country.str.split('_') # Create the 'type' column ebola_melt['type'] = ebola_melt['str_split'].str.get(0) # Create the 'country' column ebola_melt['country'] = ebola_melt['str_split'].str.get(1)
-
Concatenation Data
# Concatenate uber1, uber2, and uber3: row_concat row_concat = pd.concat([uber1,uber2,uber3]) # Concatenate ebola_melt and status_country column-wise: ebola_tidy ebola_tidy = pd.concat([ebola_melt,status_country],axis=1) #Tons of files to concat # Import necessary modules import glob import pandas as pd # Write the pattern: pattern pattern = '*.csv' # Save all file matches: csv_files csv_files = glob.glob(pattern) # Print the file names print(csv_files) # Load the second file into a DataFrame: csv2 csv2 = pd.read_csv(csv_files[1]) # Print the head of csv2 print(csv2.head())
2. 函数
df.sample() 用法参考
df.sample(frac=0.5,replace=True,random_state=123) #按比例抽取,是否有序放回,设置随机种子
df.isnull()
df.isnull().values.any() #.values返回array,.any()返回True False
df.isnull().sum() #sum null
df.groupby()
#groupby后面聚类算法__猜测只能为num__
df.groupby('col_name').sum()
Pandas Foundations
1. Inspectiong Data
import pandas as pd
type(df)
type(df.columns)
type(df.index)
df.describe()
df.shape
df.columns
df.index
df.iloc[:5,:]
df.corr() #only numbers
# broadcasting, assigning scala value to col slice broadcasts value to each rows
df.iloc[::3,-1]=np.nan
#Series
low=df['Low']
type(low)
low.head()
lows=low.values
type(lows)
#View the first few and last few rows of a DataFrame
df.head()
df.tail()
count values
df['col_name'].value_counts()
df.col_name.value_counts()
show unique value
df['col_name'].unique()
df.col_name.unique()
2. Numpy and Pandas Together
import numpy as np
#.values to represent a DataFrame df as a NumPy array.
np_vals = df.values
# np.log10() method to compute the base 10 logarithm
np_vals_log10 = np.log10(np_vals)
df_log10 = np.log10(df)
[print(x, 'has type', type(eval(x))) for x in ['np_vals', 'np_vals_log10', 'df', 'df_log10']]
3. Zip list to build a df
#Zip the 2 lists together into one list of (key,value) tuples: zipped
zipped = list(zip(list_keys,list_values))
print(zipped)
data = dict(zipped)
df = pd.DataFrame(data)
print(df)
-
Labeling Data
list_labels = ['year', 'artist', 'song', 'chart weeks'] # Assign the list of labels to the columns attribute: df.columns df.columns=list_labels
4. Reading & Saving
-
Reading a flat file
df=pd.read_csv(file_name,header=None,names=col_name,na_values={'column_name':[' -1']},parse_date=[[0,1,2]]) df.index=df['date_time'] df.index.name='key_date' #trimming redundant columns cols=['sunspots','definite'] df=df[cols]
# header=0 to rename the column labels new_labels = ['year','population'] df2 = pd.read_csv(data_file, header=0, names=new_label
-
Delimiters, headers, and extensions
df2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#') # Save the cleaned up DataFrame to a CSV file without the index df2.to_csv(file_clean, index=False) # Save the cleaned up DataFrame to an excel file without the index df2.to_excel('file_clean.xlsx', index=False)
5. Plot
-
Ployting series using pandas
# Create a plot with color='red' df.plot(color='red') # Add a title plt.title('Temperature in Austin') # Specify the x-axis label plt.xlabel('Hours since midnight August 1, 2010') # Specify the y-axis label plt.ylabel('Temperature (degrees F)') # Display the plot plt.show()
#fix scales plt.yscale('log') #legend df.plot(legend=True) #axis plt.axis(('2001','2002',0,100)) #saving plot plt.savefig()
-
Plotting DataFrame
# Plot all columns (default) df.plot() plt.show() # Plot all columns as subplots df.plot(subplots=True) plt.show() # Plot just the Dew Point data column_list1 = ['Dew Point (deg F)'] df[column_list1].plot() plt.show() # Plot the Dew Point and Temperature data, but not the Pressure data column_list2 = ['Temperature (deg F)','Dew Point (deg F)'] df[column_list2].plot() plt.show()
-
Plots
# s for size,c for color,alpha for opacity df.plot(kind='scatter',x='HP',y='MPG',s=sizes,alpha=,c=) df[cols].plot(kind='box',subplots=True) # Put the x-axis on a logarithmic scale plt.xscale('log')
# Create a list of y-axis column names: y_columns y_columns = ['AAPL','IBM'] df.plot(x='Month', y=y_columns) plt.title('Monthly stock prices') plt.ylabel('Price ($US)') plt.show() plt.hist(x,bins=int) # clears the entire current figure与其所有轴,但是窗口打开,这样它可以被重复使用其他地块。 plt.clf()
#plotting the PDF, only normed=True; CDF, cumulative=True in addition to normed=True # This formats the plots such that they appear on separate rows fig, axes = plt.subplots(nrows=2, ncols=1) # Plot the PDF df.fraction.plot(ax=axes[0], kind='hist', normed=True,bins=30, range=(0,.3)) plt.show() # Plot the CDF df.fraction.plot(ax=axes[1], kind='hist', normed=True, bins=30, cumulative=True, range=(0,.3)) plt.show()
- Appendix
#Histogram # Import matplotlib.pyplot import matplotlib.pyplot as plt # Describe the column df['Existing Zoning Sqft'].describe() # Plot the histogram df['Existing Zoning Sqft'].plot(kind='hist', rot=70, logx=True, logy=True) #Boxplot # Create the boxplot df.boxplot(column='initial_cost', by='Borough', rot=90) #Scatter Plot # Create and display the first scatter plot df.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70)
6. Statistical exploratory data analysis
-
pandas和numpy涉及到axis参数的问题
#summary stat of numerical cols of DataFrame df.describe() df.max() df.min() df.quantile([0.05,0.95]) df.std() #axis='columns' computes the mean across all columns per row. mean = df.mean(axis='columns') mean.plot()
-
Separating populations with Boolean indexing
df['col'].unique() df[df['origin'] == 'US'].count()
-
小case
# Display the box plots on 3 separate rows and 1 column fig, axes = plt.subplots(nrows=3, ncols=1) # Generate a box plot of the fare prices for the First passenger class titanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box') # Generate a box plot of the fare prices for the Second passenger class titanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box') # Generate a box plot of the fare prices for the Third passenger class titanic.loc[titanic[ 'pclass']==3].plot(ax=axes[2], y='fare', kind='box') # Display the plot plt.show()
-
7. Time Series
-
Creating DatetimeIndex
# Prepare a format string: time_format time_format = '%Y-%m-%d %H:%M' # Convert date_list into a datetime object: my_datetimes my_datetimes = pd.to_datetime(date_list, format=time_format) # Construct a pandas Series using temperature_list and my_datetimes: time_series time_series = pd.Series(temperature_list, index=my_datetimes)
-
Extracting data using datetime range
ts1 = ts0.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00'] ts2 = ts0.loc['2010-07-04'] ts3 = ts0.loc['2010-12-15':'2010-12-31']
-
Reindexing
Input | Description |
---|---|
'min','T' | minute |
'H' | hour |
'D' | day |
'B' | business day |
'W' | week |
'M' | month |
'Q' | quarter |
'A' | year |
-
Resampling time series
Downsampling: reduce datetime rows to slower freq. day->week
Upsampling: increase datetime rows to faster freq. day->hourdf.resample('D').mean() df.resample('4H').ffill()
8. 防盗
Seaborn 参考
-
Format Setting
有5个seaborn的主题
- darkgrid 黑色网格(默认)
- whitegrid 白色网格
- dark 黑色背景
- white 白色背景
- ticks 四周都有刻度线的白背景sns.set_style("whitegrid")
画布大小
plt.figure(figsize=(7,7))
坐标轴倾斜45°
plt.xticks(rotation=45)
-
Bar Plot
seaborn.barplot (x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=, ci=95, n_boot=1000, units=None, orient=None, color=None, palette=None, saturation=0.75, errcolor='.26', errwidth=None, capsize=None, dodge=True, ax=None, **kwargs)
- x,y,hue : names of variable in data or vector data
- data : DataFrame,array or list of array,optional
- color :matplotlib color,optional
- palette : palette name,list, or dict,optional 参考调色板教程
- ax : matplotlib Axes,optional
- hue: 类似于label#hue也可以换为['female','male'],palette=sns.cubehelix_palette(120) ax=sns.barplot(x=data.gender.value_counts().index,y=data.gender.value_counts(), hue=data['gender'].value_counts().index,palette="Greens_d") plt.legend(loc=8) plt.xlabel('Gender') plt.ylabel('Frequency') #另一种写法ax.set_ylabels() plt.title('Show of Gender Bar Plot') plt.show()
-
Catplot 参考文章 (理解为高级汇总,kind='types')
plt.figure(figsize=(10,10)) g=sns.catplot(x='gender',y='math score',data=data,hue='lunch',kind='bar',height=4, aspect=1) g.despine(left=True) #移除画布框线 plt.tight_layout() #紧缩排列 plt.show()
f,ax=plt.subplots(figsize=(9,10)) sns.barplot(y=data.gender.value_counts().index,x=data.gender.value_counts() ,label='gender',alpha=0.5,color='red') sns.barplot(y=data['race/ethnicity'].value_counts().index,x=data['race/ethnicity'].value_counts(), color='blue',label='race',alpha=0.7) ax.legend(loc='upper right',frameon=True) ax.set(xlabel='Gender , Race/Ethnicity',ylabel='Groups',title="Gender vs Race/Ethnicity ") plt.show()
-
Point Plot
seaborn.pointplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=, ci=95, n_boot=1000, units=None, markers='o', linestyles='-', dodge=False, join=True, scale=1, orient=None, color=None, palette=None, errwidth=None, capsize=None, ax=None, **kwargs)
- x, y, hue : names of variables in data or vector data, optional
- data : DataFrame, array, or list of arrays, optional
- order, hue_order : lists of strings, optional
- markers : string or list of strings, optional
- linestyles : string or list of strings, optional
- color : matplotlib color, optional
- palette : palette name, list, or dict, optional
- ax : matplotlib Axes, optionalf,ax1=plt.subplots(figsize=(25,10)) sns.pointplot(x=np.arange(1,191), y=data[(data['Race/Ethnicity']=='group B')].Math_Score,color='blue',alpha=0.6) sns.pointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Reading_Score,color='pink',alpha=0.2) ax1.set(xlabel='Group B index State',ylabel='Frequency',title="Group B Math Score & Reading_Score") #也可以用plt.xlabel plt.xticks(rotation=90) #轴旋转 plt.grid() #加网线 plt.show()
ax=sns.pointplot(x='Writing_Score',y='Math_Score',hue='Gender',data=data,markers=["o", "x"],linestyles=["-", "--"]) plt.legend(loc='lower right') plt.xticks(rotation=90) plt.show()
-
Joint Plot 包含核密度估计 KDE图
seaborn.jointplot(x, y, data=None, kind='scatter', stat_func=None, color=None, height=6, ratio=5, space=0.2, dropna=True, xlim=None, ylim=None, joint_kws=None, marginal_kws=None, annot_kws=None, **kwargs)
- x, y : strings or vectors
- data : DataFrame, optional
- kind : { “scatter” | “reg” | “resid” | “kde” | “hex” }, optional
- color : matplotlib color, optional
- dropna : bool, optionalplt.figure(figsize=(10,10)) #kind='kde' 核密度图 sns.jointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Math_Score,color='lime',kind='hex',alpha=0.8) plt.xlabel('Group B index State') plt.ylabel('Frequency') plt.title('Group B Frequency Race/Ethnicity') plt.xticks(rotation=90) plt.tight_layout() plt.show()
#散点图+KDE图 data['Race/Ethnicity'].unique() len(data[(data['Race/Ethnicity']=='group B')].Math_Score) plt.figure(figsize=(10,10)) sns.jointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Reading_Score,color='k').plot_joint(sns.kdeplot, zorder=0, n_levels=6) plt.xlabel('Group B index State') plt.ylabel('Frequency') plt.title('Group B Math Score & Reading_Score') plt.xticks(rotation=90) plt.show()
-
KDE Plot (jointplot中可以kind='kde')
seaborn.kdeplot(data, data2=None, shade=False, vertical=False, kernel='gau', bw='scott', gridsize=100, cut=3, clip=None, legend=True, cumulative=False, shade_lowest=True, cbar=False, cbar_ax=None, cbar_kws=None, ax=None, **kwargs)
- data : 1d array-like
- data2: 1d array-like, optional
- shade : bool, optional
- vertical : bool, optional
- kernel : {‘gau’ | ‘cos’ | ‘biw’ | ‘epa’ | ‘tri’ | ‘triw’ }, optional
- cut : scalar, optional
- legend : bool, optional
- ax : matplotlib axes, optional -
Pie chart
plt.figure(figsize=(7,7)) #explode是偏离轴心(可小数),shadow是阴影 plt.pie(data['Race/Ethnicity'].value_counts().values,explode=[0,0,0.0,0,0.1],labels=data['Race/Ethnicity'].value_counts().index,colors=['pink','grey','yellow','lime','brown'],autopct='%1.1f%%',shadow=True) plt.title('Race/Ethnicity According Analysis',color='black',fontsize=10) plt.legend(data['Race/Ethnicity'].value_counts().index,loc=2) plt.axis('equal') #避免比例压缩为椭圆 plt.show()
-
LM Plot
seaborn.lmplot(x, y, data, hue=None, col=None, row=None, palette=None, col_wrap=None, height=5, aspect=1, markers='o', sharex=True, sharey=True, hue_order=None, col_order=None, row_order=None, legend=True, legend_out=True, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=False, x_jitter=None, y_jitter=None, scatter_kws=None, line_kws=None, size=None)
- x, y : strings,options
- data : DataFrame, optional
- hue, col, row : strings
- palette : palette name, list, or dict, optional
- markers : matplotlib marker code or list of marker codes, optional
- legend : bool, optional
- scatter : bool, optionalsns.lmplot(x='Math_Score',y='Writing_Score',data=data,hue='Gender',markers=['x','o']) plt.ylabel('Writing Score') plt.xlabel('Math Score') plt.show()
Practices
-
练习resampling
import numpy as ny import pandas as pd import matplotlib.pyplot as plt df=pd.DataFrame(pd.read_csv('test.csv')) #给特定列重命名 df=df.rename(columns ={'_c1':'date'}) #删除where col_name is null的rows df1=df[df['date'].notnull()] #将目标列设置为index df1.set_index(['date'],inplace=True) #resample只能适用于datetimeindex,因此将index转为datetimeindex格式 df1.index=pd.DatetimeIndex(df1.index) #对第二列开始的数据进行分类求和 df1.iloc[:,1:].resample('W').sum() #对id分组再进行resampling df1.groupby('id').resample('W').sum().iloc[:,1:]
网友评论