美文网首页
Pandas笔记

Pandas笔记

作者: 生煎小包 | 来源:发表于2019-03-20 11:30 被阅读0次

    Python Data Analysis Notebook

    SublimeText File

    Data Frame

    • read file

      pd.read_csv(path,index_col=0,chunksize=1000)
      np.shape()#读取矩阵维度
      pd.DataFrame()
      df[['var','var']] #-->dataframe
      df[1:4]
      df.head() #不传参数时默认返回前五条
      #how to refer col and row
      df.loc[label]
      df.loc[[row],[col]] #-->dataframe
      df.iloc[index]
      
    • iterating over data

      result=[]
      total=0
      for chunk in pd.read_csv(path,chunksize=1000):
          result.append(sum(chunk['xcolumn']))
          #另一种方法
          total+=sum(chunk['x'])
      total=sum(result)
      
    • NumPy

      np.logical_and(x,y)
      np.logical_or()
      np.logical_not()
      
    • Loop

      enumerate(iterable) #--> index of the value
      for index,value in enumerate(iterable,start=10):
          print(index,value)
      loop over a dictionary:
      dic={}
      for key,value in dic.items():
          print(key,value)
      import numpy as np
      array1=np.array()
      array2=np.array()
      array_total=np.array([array1,array2])
      for val in np.nditer(array_total):
          print(val)
      
      import pandas as pd
      df=pd.read_csv()
      for label,row in df.iterrows():
          print(label)
          print (row)
      
      add a new column to a DataFrame
      for lab,row in df.iterrows():
          df.loc[lab,'name_length']=len(row['country'])
      print(df)
      #更简单的方法是使用apply function
      df['name_length']=df['country'].apply(len)
      apply(str.upper)
      
      pseudo-random
      import numpy as np
      np.random.seed(123)
      coin=np.random.randint(0,2)
      
      '''
      reduce() 函数将一个数据集合(列表,元组等)中的所有数据进行下列操作:
      用传给 reduce 中的函数 function(有两个参数)先对集合中的第 1、2 个元素进行操作,
      得到的结果再与第三个数据用 function 函数运算,最后得到一个结果。
      '''
      from functools import reduce
      reduce(function,iterable)
      '''
      map() 会根据提供的函数对指定序列做映射。
      第一个参数 function 以参数序列中的每一个元素调用 function 函数,
      返回包含每次 function 函数返回值的新列表。
      '''
      map(function,iterable,...)
      '''
      filter() 函数用于过滤序列,过滤掉不符合条件的元素,返回由符合条件元素组成的新列表。
      该函数接收两个参数,第一个为函数,第二个为序列,
      序列的每个元素作为参数传递给函数进行判断,然后返回 True 或 False,最后将返回 True 的元素放到新列表中。
      '''
      filter(function,iterable)
      
      def func():
          try:
              return function
          except TypeError:
              return print()
      
      # 抛出异常
      raise ValueError
      

    Iterators

     #iterating over iterables
     word='DA'
     it=iter(word)
     next(it)
     print(*it) #->once per time
    
     zip() #-> return an iterator of tuples
     list(zip(lst1,lst2))
     print(*zipvariable)
    
    • List comprehensions

      lst=[1,2,3,4]
      lst_add_1=[num+1 for num in lst]
      lst=[num for num in range(11)]
      
      conditionals in comprehensions
      [num**2 for num in range(10) if num%2==0]
      
      dictionary comprehensions
      {num:-num for num in range(10)}
      
      generator->analogous generator object||iterator
      (num**2 for num in range(10) if num%2==0)
      
      generator function->yield
      def num_sequence(n):
          i=0
          while i<n:
              yield i
              i+=1
      

    Importing Data in python

     file_name=open('file.txt',mode='r') #only to read
     #'w' is to write
     text=file_name.read()
     file_name.close()
     with open('datacamp.csv','r') as datacamp: #context manager
         datacamp.readline()
     ! ls #will display the contents of your current directory.
    
    • flat files (contain records; each row is a record of fields or attributes, each column is a feature or attribute)

      #can have a header
      #numpy:storing numerical data
      import numpy as np
      filename='MNIST.txt'
      data=np.loadtxt(filename,delimiter=',',skiprows=1,usecols=[0,2],dtype=str) #跳过第一行 #只要第一和三列数据
      
      #import with different type of data
      np.genfromtxt(filename,delimiter=',',names=True,dtype=None)
      
      import pandas as pd
      filename='winequality-red.csv'
      data=pd.read_csv(filename,nrows=5,header=None)#first 5 rows of the file,there is no header in this file
      
      #convert dataframe to a numpy array
      data_array=data.values
      
    • Other files

      data=pd.ExcelFile(filename)
      print(data.sheet_names)#figure sheet name out
      df1=data.parse('sheetname')#sheet name
      df2=data.parse(0,skiprows=[0],parse_cols=[0],names=['Country'])#sheet index
      
      #sas file
      import pandas as pd
      from sas7bdat import SAS7BDAT
      with SAS7BDAT('file.sas7bdat') as file:
          df_sas=file.to_data_frame()
      
      #stata file_name
      import pandas as pd
      data=pd.read_stata('file.dta')
      
      #HDF5 file
      import h5py
      file_name='file.hdf5'
      data=h5py.File(file_name,'r')
      for key in data.keys():
          print(key)
      
      #matlab file
      import scipy.io
      scipy.io.loadmat() #- read .mat files
      scipy.io.savemat() #- write .mat files
      
    • Relational Database

      # Open engine in context manager
      # Perform query and save results to DataFrame: df
      
      engine = create_engine('sqlite:///Chinook.sqlite')
      with engine.connect() as con:
          rs = con.execute("SELECT LastName, Title FROM Employee")
          df = pd.DataFrame(rs.fetchmany(size=3))
          df.columns = rs.keys()
      # Print the length of the DataFrame df
      print(len(df))
      # Print the head of the DataFrame df
      print(df.head())
      
      # Import packages
      from sqlalchemy import create_engine
      import pandas as pd
      # Create engine: engine
      engine = create_engine('sqlite:///Chinook.sqlite')
      # Execute query and store records in DataFrame: df
      df = pd.read_sql_query("SELECT * From Album", engine)
      # Print head of DataFrame
      print(df.head())
      

    enumerate #返回数据和其位置

     test=[1,2,3,4,5]
     for i, num in enumerate(test):
         print(i,num)
    

    Clean Data

    1. Sublime笔记整理

        # Print the head of df
        print(df.head())
        # Print the tail of df
        print(df.tail())
        # Print the shape of df
        print(df.shape)
        # Print the columns of df
        print(df.columns)
        # Rename the columns
        gapminder_melt.columns = ['country', 'year', 'life_expectancy']
        # Print the info of df,provides important information about a DataFrame
        print(df.info())
        # Print the value_counts for 'State'
        print(df.State.value_counts(dropna=False))
        # Print the value counts for 'Site Fill'
        print(df['Site Fill'].value_counts(dropna=False))
    
    • Transpose

      # Melt airquality: airquality_melt,The id_vars represent the columns of the data you do not want to melt (i.e., keep it in its current shape),
      # while the value_vars represent the columns you do wish to melt into rows.
      airquality_melt = pd.melt(frame=df,id_vars=['Month','Day'], value_vars=['Ozone','Solar.R','Wind','Temp'],var_name='measurement', value_name='reading')
      
    • Pivot_Table

      #(非重复数据)
      # Pivot airquality_melt: airquality_pivot
      airquality_pivot = airquality_melt.pivot_table(index=['Month','Day'], columns='measurement', values='reading')
      #(重复数据)
      # Pivot table the airquality_dup: airquality_pivot
      airquality_pivot = airquality_dup.pivot_table(index=['Month', 'Day'], columns='measurement', values='reading', aggfunc=np.mean)
      

      层次化索引(PIVOT之后)

      # Print the index of airquality_pivot
      print(airquality_pivot.index)
      # Reset the index of airquality_pivot: airquality_pivot_reset
      airquality_pivot_reset = airquality_pivot.reset_index()
      
    • Melting and Parsing

      # Melt tb: tb_melt
      tb_melt = pd.melt(frame=tb,id_vars=['country', 'year'])
      # Create the 'gender' column
      tb_melt['gender'] = tb_melt.variable.str[0]
      # Melt ebola: ebola_melt
      ebola_melt = pd.melt(ebola, id_vars=['Date', 'Day'], var_name='type_country', value_name='counts')
      # Create the 'str_split' column
      ebola_melt['str_split'] = ebola_melt.type_country.str.split('_')
      # Create the 'type' column
      ebola_melt['type'] = ebola_melt['str_split'].str.get(0)
      # Create the 'country' column
      ebola_melt['country'] = ebola_melt['str_split'].str.get(1)
      
    • Concatenation Data

      # Concatenate uber1, uber2, and uber3: row_concat
      row_concat = pd.concat([uber1,uber2,uber3])
      # Concatenate ebola_melt and status_country column-wise: ebola_tidy
      ebola_tidy = pd.concat([ebola_melt,status_country],axis=1)
      
      #Tons of files to concat
      # Import necessary modules
      import glob
      import pandas as pd
      # Write the pattern: pattern
      pattern = '*.csv'
      # Save all file matches: csv_files
      csv_files = glob.glob(pattern)
      # Print the file names
      print(csv_files)
      # Load the second file into a DataFrame: csv2
      csv2 = pd.read_csv(csv_files[1])
      # Print the head of csv2
      print(csv2.head())
      

    2. 函数

    df.sample() 用法参考

     df.sample(frac=0.5,replace=True,random_state=123) #按比例抽取,是否有放回抽样,设置随机种子
    

    df.isnull()

     df.isnull().values.any() #.values返回array,.any()返回True False
     df.isnull().sum() #sum null
    
    

    df.groupby()

        #groupby后面接聚合函数(猜测只适用于数值列)
        df.groupby('col_name').sum()
    

    Pandas Foundations

    1. Inspecting Data

      import pandas as pd
      type(df)
      type(df.columns)
      type(df.index)
      df.describe()
      df.shape
      df.columns
      df.index
      df.iloc[:5,:]
        df.corr() #only numbers
      # broadcasting: assigning a scalar value to a column slice broadcasts the value to each row
      df.iloc[::3,-1]=np.nan
      #Series
      low=df['Low']
      type(low)
      low.head()
      lows=low.values
      type(lows)
      #View the first few and last few rows of a DataFrame
      df.head()
      df.tail()
    

    count values

     df['col_name'].value_counts()
     df.col_name.value_counts()
    

    show unique value

     df['col_name'].unique()
     df.col_name.unique()
    

    2. Numpy and Pandas Together

      import numpy as np
      #.values to represent a DataFrame df as a NumPy array.
      np_vals = df.values
      # np.log10() method to compute the base 10 logarithm
      np_vals_log10 = np.log10(np_vals)
      df_log10 = np.log10(df)
      [print(x, 'has type', type(eval(x))) for x in ['np_vals', 'np_vals_log10', 'df', 'df_log10']]
    

    3. Zip list to build a df

      #Zip the 2 lists together into one list of (key,value) tuples: zipped
      zipped = list(zip(list_keys,list_values))
      print(zipped)
      data = dict(zipped)
      df = pd.DataFrame(data)
      print(df)
    
    • Labeling Data

        list_labels = ['year', 'artist', 'song', 'chart weeks']
        # Assign the list of labels to the columns attribute: df.columns
        df.columns=list_labels
      

    4. Reading & Saving

    • Reading a flat file

        df=pd.read_csv(file_name,header=None,names=col_name,na_values={'column_name':[' -1']},parse_dates=[[0,1,2]])
        df.index=df['date_time']
        df.index.name='key_date'
        #trimming redundant columns
        cols=['sunspots','definite']
        df=df[cols]
      
        # header=0 to rename the column labels
        new_labels = ['year','population']
        df2 = pd.read_csv(data_file, header=0, names=new_labels)
      
    • Delimiters, headers, and extensions

        df2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')
        # Save the cleaned up DataFrame to a CSV file without the index
        df2.to_csv(file_clean, index=False)
        # Save the cleaned up DataFrame to an excel file without the index
        df2.to_excel('file_clean.xlsx', index=False)
      

    5. Plot

    • Plotting series using pandas

        # Create a plot with color='red'
        df.plot(color='red')
        # Add a title
        plt.title('Temperature in Austin')
        # Specify the x-axis label
        plt.xlabel('Hours since midnight August 1, 2010')
        # Specify the y-axis label
        plt.ylabel('Temperature (degrees F)')
        # Display the plot
        plt.show()
      
        #fix scales
        plt.yscale('log')
        #legend
        df.plot(legend=True)
        #axis
        plt.axis(('2001','2002',0,100))
        #saving plot
        plt.savefig()
      
    • Plotting DataFrame

        # Plot all columns (default)
        df.plot()
        plt.show()
      
        # Plot all columns as subplots
        df.plot(subplots=True)
        plt.show()
      
        # Plot just the Dew Point data
        column_list1 = ['Dew Point (deg F)']
        df[column_list1].plot()
        plt.show()
        # Plot the Dew Point and Temperature data, but not the Pressure data
        column_list2 = ['Temperature (deg F)','Dew Point (deg F)']
        df[column_list2].plot()
        plt.show()
      
    • Plots

            # s for size,c for color,alpha for opacity
        df.plot(kind='scatter',x='HP',y='MPG',s=sizes,alpha=,c=)
        df[cols].plot(kind='box',subplots=True)
            # Put the x-axis on a logarithmic scale
            plt.xscale('log')
      
        # Create a list of y-axis column names: y_columns
        y_columns = ['AAPL','IBM']
        df.plot(x='Month', y=y_columns)
        plt.title('Monthly stock prices')
        plt.ylabel('Price ($US)')
        plt.show()
            plt.hist(x,bins=int)
            # 清除当前整个figure及其所有轴,但保持窗口打开,以便重复用于其他绘图。
            plt.clf()
      
        #plotting the PDF, only normed=True; CDF, cumulative=True in addition to normed=True
      
        # This formats the plots such that they appear on separate rows
        fig, axes = plt.subplots(nrows=2, ncols=1)
      
        # Plot the PDF
        df.fraction.plot(ax=axes[0], kind='hist', normed=True,bins=30, range=(0,.3))
        plt.show()
      
        # Plot the CDF
        df.fraction.plot(ax=axes[1], kind='hist', normed=True, bins=30, cumulative=True, range=(0,.3))
        plt.show()
      
      • Appendix
      #Histogram
      # Import matplotlib.pyplot
      import matplotlib.pyplot as plt
      # Describe the column
      df['Existing Zoning Sqft'].describe()
      # Plot the histogram
      df['Existing Zoning Sqft'].plot(kind='hist', rot=70, logx=True, logy=True)
      
      #Boxplot
      # Create the boxplot
      df.boxplot(column='initial_cost', by='Borough', rot=90)
      
      #Scatter Plot
      # Create and display the first scatter plot
      df.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70)
      

    6. Statistical exploratory data analysis

    • pandas和numpy涉及到axis参数的问题

      参阅: 如何理解pandas中axis参数

        #summary stat of numerical cols of DataFrame
        df.describe()
        df.max()
        df.min()
        df.quantile([0.05,0.95])
        df.std()
      
        #axis='columns' computes the mean across all columns per row.
        mean = df.mean(axis='columns')
        mean.plot()
      
    • Separating populations with Boolean indexing

        df['col'].unique()
        df[df['origin'] == 'US'].count()
      
      • 小case

          # Display the box plots on 3 separate rows and 1 column
          fig, axes = plt.subplots(nrows=3, ncols=1)
        
          # Generate a box plot of the fare prices for the First passenger class
          titanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')
        
          # Generate a box plot of the fare prices for the Second passenger class
          titanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')
        
          # Generate a box plot of the fare prices for the Third passenger class
          titanic.loc[titanic[ 'pclass']==3].plot(ax=axes[2], y='fare', kind='box')
        
          # Display the plot
          plt.show()
        

    7. Time Series

    • Creating DatetimeIndex

        # Prepare a format string: time_format
        time_format = '%Y-%m-%d %H:%M'
      
        # Convert date_list into a datetime object: my_datetimes
        my_datetimes = pd.to_datetime(date_list, format=time_format)
      
        # Construct a pandas Series using temperature_list and my_datetimes: time_series
        time_series = pd.Series(temperature_list, index=my_datetimes)
      
    • Extracting data using datetime range

        ts1 = ts0.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']
        ts2 = ts0.loc['2010-07-04']
        ts3 = ts0.loc['2010-12-15':'2010-12-31']
      
    • Reindexing

    Input Description
    'min','T' minute
    'H' hour
    'D' day
    'B' business day
    'W' week
    'M' month
    'Q' quarter
    'A' year
    • Resampling time series

      Downsampling: reduce datetime rows to slower freq. day->week
      Upsampling: increase datetime rows to faster freq. day->hour

      df.resample('D').mean()
      df.resample('4H').ffill()
      

    8. 防盗

    Seaborn 参考

    • Format Setting

      有5个seaborn的主题
      - darkgrid 黑色网格(默认)
      - whitegrid 白色网格
      - dark 黑色背景
      - white 白色背景
      - ticks 四周都有刻度线的白背景

      sns.set_style("whitegrid")
      

      画布大小

      plt.figure(figsize=(7,7))
      

      坐标轴倾斜45°

      plt.xticks(rotation=45)
      
    • Bar Plot

      seaborn.barplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=np.mean, ci=95, n_boot=1000, units=None, orient=None, color=None, palette=None, saturation=0.75, errcolor='.26', errwidth=None, capsize=None, dodge=True, ax=None, **kwargs)
      - x,y,hue : names of variable in data or vector data
      - data : DataFrame,array or list of array,optional
      - color :matplotlib color,optional
      - palette : palette name,list, or dict,optional 参考调色板教程
      - ax : matplotlib Axes,optional
      - hue: 类似于label

      #hue也可以换为['female','male'],palette=sns.cubehelix_palette(120)
      ax=sns.barplot(x=data.gender.value_counts().index,y=data.gender.value_counts(), hue=data['gender'].value_counts().index,palette="Greens_d")
      plt.legend(loc=8)
      plt.xlabel('Gender')
      plt.ylabel('Frequency') #另一种写法ax.set_ylabel()
      plt.title('Show of Gender Bar Plot')
      plt.show()
      
    • Catplot 参考文章 (理解为高级汇总,kind='types')

      seaborn.despine

      plt.figure(figsize=(10,10))
      g=sns.catplot(x='gender',y='math score',data=data,hue='lunch',kind='bar',height=4, aspect=1)
      g.despine(left=True) #移除画布框线
      plt.tight_layout() #紧缩排列
      plt.show()
      
      f,ax=plt.subplots(figsize=(9,10))
      sns.barplot(y=data.gender.value_counts().index,x=data.gender.value_counts()
                  ,label='gender',alpha=0.5,color='red')
      sns.barplot(y=data['race/ethnicity'].value_counts().index,x=data['race/ethnicity'].value_counts(),
                    color='blue',label='race',alpha=0.7)
      ax.legend(loc='upper right',frameon=True)
      ax.set(xlabel='Gender , Race/Ethnicity',ylabel='Groups',title="Gender vs Race/Ethnicity ")
      plt.show()
      
    • Point Plot

      seaborn.pointplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=np.mean, ci=95, n_boot=1000, units=None, markers='o', linestyles='-', dodge=False, join=True, scale=1, orient=None, color=None, palette=None, errwidth=None, capsize=None, ax=None, **kwargs)
      - x, y, hue : names of variables in data or vector data, optional
      - data : DataFrame, array, or list of arrays, optional
      - order, hue_order : lists of strings, optional
      - markers : string or list of strings, optional
      - linestyles : string or list of strings, optional
      - color : matplotlib color, optional
      - palette : palette name, list, or dict, optional
      - ax : matplotlib Axes, optional

      f,ax1=plt.subplots(figsize=(25,10))
      sns.pointplot(x=np.arange(1,191), y=data[(data['Race/Ethnicity']=='group B')].Math_Score,color='blue',alpha=0.6)
      sns.pointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Reading_Score,color='pink',alpha=0.2)
      ax1.set(xlabel='Group B index State',ylabel='Frequency',title="Group B Math Score & Reading_Score") #也可以用plt.xlabel
      plt.xticks(rotation=90) #轴旋转
      plt.grid() #加网线
      plt.show()
      
      ax=sns.pointplot(x='Writing_Score',y='Math_Score',hue='Gender',data=data,markers=["o", "x"],linestyles=["-", "--"])
      plt.legend(loc='lower right')
      plt.xticks(rotation=90)
      plt.show()
      
    • Joint Plot 包含核密度估计 KDE图

      seaborn.jointplot(x, y, data=None, kind='scatter', stat_func=None, color=None, height=6, ratio=5, space=0.2, dropna=True, xlim=None, ylim=None, joint_kws=None, marginal_kws=None, annot_kws=None, **kwargs)
      - x, y : strings or vectors
      - data : DataFrame, optional
      - kind : { “scatter” | “reg” | “resid” | “kde” | “hex” }, optional
      - color : matplotlib color, optional
      - dropna : bool, optional

      plt.figure(figsize=(10,10))
      #kind='kde' 核密度图
      sns.jointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Math_Score,color='lime',kind='hex',alpha=0.8)
      plt.xlabel('Group B index State')
      plt.ylabel('Frequency')
      plt.title('Group B Frequency Race/Ethnicity')
      plt.xticks(rotation=90)
      plt.tight_layout()
      plt.show()
      
      #散点图+KDE图
      data['Race/Ethnicity'].unique()
      len(data[(data['Race/Ethnicity']=='group B')].Math_Score)
      plt.figure(figsize=(10,10))
      sns.jointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Reading_Score,color='k').plot_joint(sns.kdeplot, zorder=0, n_levels=6)
      plt.xlabel('Group B index State')
      plt.ylabel('Frequency')
      plt.title('Group B Math Score & Reading_Score')
      plt.xticks(rotation=90)
      plt.show()
      
    • KDE Plot (jointplot中可以kind='kde')

      seaborn.kdeplot(data, data2=None, shade=False, vertical=False, kernel='gau', bw='scott', gridsize=100, cut=3, clip=None, legend=True, cumulative=False, shade_lowest=True, cbar=False, cbar_ax=None, cbar_kws=None, ax=None, **kwargs)
      - data : 1d array-like
      - data2: 1d array-like, optional
      - shade : bool, optional
      - vertical : bool, optional
      - kernel : {‘gau’ | ‘cos’ | ‘biw’ | ‘epa’ | ‘tri’ | ‘triw’ }, optional
      - cut : scalar, optional
      - legend : bool, optional
      - ax : matplotlib axes, optional

    • Pie chart

      plt.figure(figsize=(7,7))
      #explode是偏离轴心(可小数),shadow是阴影
      plt.pie(data['Race/Ethnicity'].value_counts().values,explode=[0,0,0.0,0,0.1],labels=data['Race/Ethnicity'].value_counts().index,colors=['pink','grey','yellow','lime','brown'],autopct='%1.1f%%',shadow=True)
      plt.title('Race/Ethnicity According Analysis',color='black',fontsize=10)
      plt.legend(data['Race/Ethnicity'].value_counts().index,loc=2)
      plt.axis('equal') #避免比例压缩为椭圆
      plt.show()
      
    • LM Plot

      seaborn.lmplot(x, y, data, hue=None, col=None, row=None, palette=None, col_wrap=None, height=5, aspect=1, markers='o', sharex=True, sharey=True, hue_order=None, col_order=None, row_order=None, legend=True, legend_out=True, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=False, x_jitter=None, y_jitter=None, scatter_kws=None, line_kws=None, size=None)
      - x, y : strings, optional
      - data : DataFrame, optional
      - hue, col, row : strings
      - palette : palette name, list, or dict, optional
      - markers : matplotlib marker code or list of marker codes, optional
      - legend : bool, optional
      - scatter : bool, optional

      sns.lmplot(x='Math_Score',y='Writing_Score',data=data,hue='Gender',markers=['x','o'])
      plt.ylabel('Writing Score')
      plt.xlabel('Math Score')
      plt.show()
      

    Practices

    • 练习resampling

        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt
        df=pd.DataFrame(pd.read_csv('test.csv'))
      
        #给特定列重命名
        df=df.rename(columns ={'_c1':'date'})
      
        #删除where col_name is null的rows
        df1=df[df['date'].notnull()]
      
        #将目标列设置为index
        df1.set_index(['date'],inplace=True)
      
        #resample只能适用于datetimeindex,因此将index转为datetimeindex格式
        df1.index=pd.DatetimeIndex(df1.index)
      
        #对第二列开始的数据进行分类求和
        df1.iloc[:,1:].resample('W').sum()
      
        #对id分组再进行resampling
        df1.groupby('id').resample('W').sum().iloc[:,1:]
      

    相关文章

      网友评论

          本文标题:Pandas笔记

          本文链接:https://www.haomeiwen.com/subject/mghzmqtx.html