Creating DataFrames from scratch
import pandas as pd
# Column-oriented input: each key becomes a column, each list its values.
data = dict(
    apples=[3, 2, 0, 1],
    oranges=[0, 3, 7, 2],
)
# Without an explicit index the rows are labelled 0..3.
purchases = pd.DataFrame(data)
# Rebuild the frame with one customer name per row as the index.
purchases = pd.DataFrame(
    data,
    index=['June', 'Robert', 'Lily', 'David'],
)
purchases
# Label-based lookup of a single row.
purchases.loc['June']
Reading data from CSVs
# The CSV reader is the top-level function pd.read_csv (there is no `pd.read`
# namespace).
df = pd.read_csv('purchases.csv')
df
# index_col=0 uses the first CSV column as the row index instead of adding a
# fresh 0..n-1 index (read_csv has no `index` keyword).
df = pd.read_csv('purchases.csv', index_col=0)
df
Reading data from JSON
# Load the JSON file into a DataFrame.
df = pd.read_json(path_or_buf='purchases.json')
Reading data from a SQL database
# sqlite3 ships with the Python standard library, so nothing has to be
# installed. (A shell command such as `pip install ...` is not valid Python;
# inside a notebook it would have to be written as `%pip install ...`.)
import sqlite3
# Open the database file; the connection is kept open because it is reused
# later when writing a table back with to_sql.
con = sqlite3.connect("database.db")
# Run the query and load the result set into a DataFrame.
df = pd.read_sql_query("SELECT * FROM purchases", con)
# Promote the column named 'index' to be the row index.
df = df.set_index('index')
Converting back to a CSV, JSON, or SQL
# Write the same frame out in each supported format.
df.to_csv(path_or_buf='new_purchases.csv')
df.to_json(path_or_buf='new_purchases.json')
# to_sql writes into a table of the given name through the open connection.
df.to_sql(name='new_purchases', con=con)
Most important DataFrame operations
# Load the movie data set, using the "Title" column as the row index.
movies_df = pd.read_csv(
    "IMDB-Movie-Data.csv",
    index_col="Title",
)
movies_df.head()   # first five rows (the default)
movies_df.tail(2)  # last two rows
movies_df.info()   # column dtypes, non-null counts and memory usage
movies_df.shape    # (number of rows, number of columns)
Handling duplicates
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. Stacking movies_df on itself duplicates every row.
temp_df = pd.concat([movies_df, movies_df])
# drop_duplicates returns a new frame, so the result has to be assigned back.
temp_df = temp_df.drop_duplicates()
temp_df.drop_duplicates(inplace=True)  # temp_df itself is modified in place
temp_df = pd.concat([movies_df, movies_df])  # make a new copy
# keep=False drops every row that has a duplicate; since every row was
# doubled above, the frame ends up empty.
temp_df.drop_duplicates(inplace=True, keep=False)
temp_df.shape
Column cleanup
movies_df.columns  # show the column names
# Replace the names that contain spaces and parentheses with plain identifiers.
movies_df.rename(columns={
    'Runtime (Minutes)': 'Runtime',
    'Revenue (Millions)': 'Revenue_millions',
}, inplace=True)
# Lowercase every column name: the rest of this file indexes columns as
# 'genre', 'rating', 'director', 'year' and 'revenue_millions', which would
# raise KeyError against the capitalised names produced above.
movies_df.columns = [col.lower() for col in movies_df.columns]
How to work with missing values
# Boolean frame: True wherever a cell is null.
movies_df.isnull()
# Null count per column.
movies_df.isnull().sum()
# Remove null values row-wise: any row containing a null is dropped.
movies_df.dropna(axis=0)
# Remove null values column-wise: any column containing a null is dropped.
# The axis convention comes from NumPy.
movies_df.dropna(axis=1)
Imputation
revenue = movies_df['revenue_millions']
revenue_mean = revenue.mean()
# Fill the gaps with the column mean and assign the result back explicitly.
# Calling fillna(inplace=True) on a Series pulled out of the frame is not
# guaranteed to update movies_df (it may operate on a copy).
revenue = revenue.fillna(revenue_mean)
movies_df['revenue_millions'] = revenue
movies_df.isnull().sum()
movies_df.describe()  # per-column summary statistics: count, median, max, min, etc.
movies_df['genre'].describe()
movies_df['genre'].value_counts().head(10)  # frequency of each value
# Correlation between columns. numeric_only=True skips the text columns,
# which would otherwise make corr() raise on pandas >= 2.0.
movies_df.corr(numeric_only=True)
DataFrame slicing, selecting, extracting
# A single label inside one pair of brackets yields a Series.
genre_col = movies_df['genre']
type(genre_col)
# A list of labels (double brackets) yields a DataFrame, even for one column.
genre_col = movies_df[['genre']]
type(genre_col)
# Several columns at once: pass all their labels in the list.
subset = movies_df[
    ['genre', 'rating']
]
subset.head()
以上主要是基于列的选取;基于行的选取则使用 .loc 和 .iloc。
# .loc looks a row up by its index label.
prom = movies_df.loc["Prometheus"]
# .iloc looks a row up by its integer position.
prom = movies_df.iloc[1]
# Label slices with .loc include both endpoints.
movie_subset = movies_df.loc[slice('Prometheus', 'Sing')]
# Positional slices with .iloc exclude the stop position, as in plain Python.
movie_subset = movies_df.iloc[slice(1, 4)]
slicing
Conditional selections
# Comparing a column to a value gives a boolean Series (True / False per row).
condition = movies_df['director'] == "Ridley Scott"
condition.head()
# Indexing with that mask keeps only the rows where it is True.
movies_df[movies_df['director'] == "Ridley Scott"]
movies_df[movies_df['rating'] >= 8.6].head(3)
# Either of two directors: combine masks with | (each side parenthesised).
movies_df[
    (movies_df['director'] == 'Christopher Nolan')
    | (movies_df['director'] == 'Ridley Scott')
].head()
# The same selection expressed with isin.
movies_df[movies_df['director'].isin(['Christopher Nolan', 'Ridley Scott'])].head()
# Released 2005-2010 inclusive, rated above 8.0, and with revenue below the
# 25th percentile of the revenue column.
movies_df[
    (movies_df['year'] >= 2005)
    & (movies_df['year'] <= 2010)
    & (movies_df['rating'] > 8.0)
    & (movies_df['revenue_millions'] < movies_df['revenue_millions'].quantile(0.25))
]
Applying functions
def rating_function(x: float) -> str:
    """Classify a numeric rating as "good" or "bad".

    Args:
        x: The rating value to classify.

    Returns:
        "good" when ``x`` is at least 8.0, otherwise "bad".
    """
    if x >= 8.0:
        return "good"
    else:
        return "bad"
# Derive a category column by running the named function over every rating.
movies_df["rating_category"] = movies_df["rating"].apply(rating_function)
movies_df.head(2)
# The same column again, this time computed with an inline lambda.
movies_df["rating_category"] = movies_df["rating"].apply(
    lambda x: 'good' if x >= 8.0 else 'bad'
)
movies_df.head(2)
Brief Plotting
# matplotlib must be installed beforehand from a shell
# (`pip install matplotlib`) or, inside a notebook, with
# `%pip install matplotlib`; a bare `pip install` line is not valid Python.
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 20, 'figure.figsize': (10, 8)})  # set font and plot size to be larger
# For categorical variables use bar charts and boxplots. For continuous
# variables use histograms, scatterplots, line graphs, and boxplots.
# The trailing semicolons below are kept from the original; in a notebook
# they suppress the printed repr of the returned Axes object.
movies_df.plot(kind='scatter', x='rating', y='revenue_millions', title='Revenue (millions) vs Rating');
movies_df['rating'].plot(kind='hist', title='Rating');
movies_df['rating'].describe()
movies_df['rating'].plot(kind="box")
# One box of revenue per rating category.
movies_df.boxplot(column='revenue_millions', by='rating_category')
Wrapping up
来源:https://www.learndatasci.com/tutorials/python-pandas-tutorial-complete-introduction-for-beginners/
最后附上总的学习网址:
https://github.com/LearnDataSci/article-resources
网友评论