Python: Pandas

作者: 欺枫 | 来源:发表于2020-11-04 20:17 被阅读0次

    pip install xlrd 可以用来读取excel

    import pandas as pd

    import datetime

    import matplotlib

    import matplotlib.pyplot as plt

    import os      #os 适用于windows的路径,包括路径拼接等

    cur_dir = os.path.dirname(__file__)

    # __file__  为当前.py file 的位置

    df = pd.read_csv(os.path.join(cur_dir,"employees.csv"))

    #print(cur_dir)

    #print(df)

    #

    # cur_dir is C:/Users/T596494/FZPython/Insideout2_dist

    # os.path.join(cur_dir,"employees.csv") is C:/Users/T596494/FZPython/Insideout2_dist\employees.csv

    # it can automatically add \

    # when print df, it will print the file details

    # df will show up the whole document's details - "employees.csv"

    print(df.shape)

    # shape will show the number of rows and columns of the data. - (1000, 4) 1000 rows, 4 columns 不含title的数据行数

    # print(df.dtypes)

    # dtypes check each columns' data type

    # employee_id        int64

    # annual_inc        float64

    # employee_title    object

    # home_ownership    object

    # dtype: object

    print(df.columns)

    # Index(['employee_id', 'annual_inc', 'employee_title', 'home_ownership'], dtype='object')

    # show each columns' title

    print(df.head())

    #df.head() 可以显示前面几行的数据,如果在括号里面输入3,print(df.head(3)) 则显示三行数据(包括title则为4行)

    print(df.sample(10))

    #随机显示10行数据,若不写,则只显示1行

    print(df.tail(5))

    #显示最后5行的数据

    #把title为annual_inc的列储存到annual_incs的变量中。annual_incs的属性同df     

    #Thisis known as a Series.

    annual_incs = df['annual_inc']

    print(annual_incs)

    print(type(annual_incs))

    print(annual_incs.head())

    print(annual_incs.shape)

    #将第二行的数据存到row_2,并且将相应数据显示出来

    row_2 = df.iloc[1]

    print(row_2)

    emp_id = row_2['employee_id']

    emp_title = row_2['employee_title']

    print('The employee with ID {emp_id} has the job title {emp_title}.' .format(emp_id = emp_id,emp_title = emp_title))

    #筛选

    Filtering by ONE condition, numeric

    To perform filtering by numeric values, first check the column is a numeric column (int64 or float64).

    Then use the following formula to do so:

    df[df['column_name']<conditional operator><value>]

    #筛选 df中年收入大于 300000的所有行

    print(df[df['annual_inc'] >=300000])

    #对字符串类型的数据的筛选

    #选出所有home_ownership 是OWN的数据

    print(df[df['home_ownership'] =='OWN'].head())

    #选出所有employee_title 是‘Accountant'或者'Sales'的数据

    print(df[df['employee_title'].isin(['Accountant','Sales'])])

    #多条件筛选

    #选出employee_title 是President, 并且annual_inc 超过225000的数据

    print(df[ (df['employee_title'] =='President') & (df['annual_inc'] >=225000)])

    #选出employee_title是Sales, 并且annual_inc 超过100000 或者小于 80000

    print(df[(df['employee_title']=='Sales') & ((df['annual_inc']>=100000) | (df['annual_inc']<=80000))])

    #保存,用df的id对df2的id做vlookup,并且保存到output.csv

    df2 = pd.read_csv(os.path.join(cur_dir,"employees2.csv"))

    output = df[df['employee_id'].isin(df2['employee_id'])]

    output.to_csv("output.csv",index=False)

    #显示文件的信息

    print(df.info())

    #Draw graph with matplotlib

    df.groupby('home_ownership')['home_ownership'].value_counts().unstack().plot(kind='bar',figsize=(10,5))

    plt.show()

    相关文章

      网友评论

        本文标题:Python: Pandas

        本文链接:https://www.haomeiwen.com/subject/crzgvktx.html