pandas sql使用笔记

作者: 叫我老村长 | 来源:发表于2019-12-21 12:29 被阅读0次

    刚接触的pandas时候,感觉使用 pandasql 更加方便点。现在原生方式用多了也觉得灵活性更大。

    # 引入
    import pandas as pd
    import numpy as np
    
    import pymysql
    
    # 数据集创建
    df = pd.DataFrame(datalist) # dict
    # sql
    connection = pymysql.connect(host='192.168.1.1', user='root', password='root',
                                 db='db1',charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    global connection
    
    sql = "SELECT * FROM server"
    df = pd.read_sql(sql,con=connection)
    df = pd.read_sql(sql,con=connection, index_col='id')
    df = df.fillna("无")
    
    # 存表
    # pd.io.sql.write_sql(df,'tablename',connection) 无法使用
    # pd.io.sql.to_sql(df, "tablename", conn,flavor='mysql',if_exists='append')
    
    from sqlalchemy import create_engine
    engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/lib?charset=utf8')
    df.to_sql("tablename", engine, if_exists='append',index=False)
    # 速度太慢试试这个
    connection = engine.raw_connection() #engine 是 from sqlalchemy import create_engine  
    cursor = connection.cursor()  
    # null value become ''
    cursor.copy_from(output,table_name,null='')  
    connection.commit()  
    cursor.close()  
    
    # 结果转list
    df['col'].tolist()
    
    # # 结果转dict
    # tmp = df.T.to_dict()
    # tmp = tmp.values()
    # tmp = list(tmp)
    # return tmp
    
    # tmp = df_all.T.to_dict()
    # tmp_list= sorted(tmp.items(), key=lambda d:d[0])
    # atmp = []
    # for x in tmp_list:
    #     atmp.append(x[1])        
    # return atmp
    
    user_dict = {12: {'Category 1': {'att_1': 1, 'att_2': 'whatever'},
                      'Category 2': {'att_1': 23, 'att_2': 'another'}},
                 15: {'Category 1': {'att_1': 10, 'att_2': 'foo'},
                      'Category 2': {'att_1': 30, 'att_2': 'bar'}}}
    
    pd.DataFrame.from_dict({(i,j): user_dict[i][j] 
                               for i in user_dict.keys() 
                               for j in user_dict[i].keys()},
                           orient='index')
    
    '''
                   att_1     att_2
    12 Category 1      1  whatever
       Category 2     23   another
    15 Category 1     10       foo
       Category 2     30       bar
    '''
    # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
    df.to_dict(orient='records') # list 数组
    
    df.to_dict(orient='split') # list 名字 + data
    
    with connection.cursor() as cursor:
        try:
            cursor.execute(sql)
            row = cursor.fetchone()
            rows = cursor.fetchall()  
        except Exception as e:
            print(e,sql) 
    connection.commit()
    return row
    
    # np
    dates = pd.date_range('20170101', periods=6)
    df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=['a','b','c','d']) # dict
    
    # excel
    df = pd.read_excel(path+'/data.xlsx', sheet_name=0, index_col=None, na_values=['NA'], encoding='utf-8') 
    
    #df = pd.read_excel(filefullpath, sheetname=[0,2],skiprows=[0])
    #sheetname指定为读取几个sheet,sheet数目从0开始
    #如果sheetname=[0,2],那代表读取第0页和第2页的sheet
    #skiprows=[0]代表读取跳过的行数第0行,不写代表不跳过标题
    #df = pd.read_excel(filefullpath, sheetname=None ,skiprows=[0])
    
    print(df)
    print(type(df))
    #若果有多页,type(df)就为<type 'dict'>
    #如果就一页,type(df)就为<class 'pandas.core.frame.DataFrame'>
    #{0:dataframe,1:dataframe,2:dataframe}
    
    # csv
    pwd = os.getcwd()
    os.chdir(os.path.dirname(trainFile))
    trainData = pd.read_csv(os.path.basename(trainFile))
    os.chdir(pwd)
    
    df = pd.read_csv(path+'/data.csv', index_col=None, na_values=['NA'], encoding='utf-8') 
    
    # 超大文件的处理
    df = pd.read_csv(path+'/data.csv', index_col=None, na_values=['NA'], encoding='utf-8',iterator = True)
    chunk = df.get_chunk(5)  #看看前五个
    df.info(memory_usage='deep') #看看总体信息
    
    # 或者分批处理
    # usecols 制定列 dtype 改类型 object 列转换为 Category 内存变小
    reader =pd.read_csv(path+'/data.csv', index_col=None, na_values=['NA'], encoding='utf-8',iterator = True)
    chunkSize = 100000
    loop = True
    chunks = []
    while loop:
        try:
            chunk = reader.get_chunk(chunkSize)
            chunks.append(chunk)
        except StopIteration:
            loop = False
            print("Iteration is stopped.")
    df = pd.concat(chunks, ignore_index=True)
    print(df)
    
    # data
    df = pd.read_table(path+'/data.dat', index_col=None, na_values=['NA'], encoding='utf-8') 
    # tocsv
    df.to_csv(path+'/data.csv', encoding='utf-8', index=False) 
    # http://pandas.pydata.org/pandas-docs/stable/io.html
    df.to_html()
    
    '''
    pd.read_csv(filename):从CSV文件导入数据
    pd.read_table(filename):从限定分隔符的文本文件导入数据
    pd.read_excel(filename):从Excel文件导入数据
    pd.read_sql(query, connection_object):从SQL表/库导入数据
    pd.read_json(json_string):从JSON格式的字符串导入数据
    pd.read_html(url):解析URL、字符串或者HTML文件,抽取其中的tables表格
    pd.read_clipboard():从你的粘贴板获取内容,并传给read_table()
    pd.DataFrame(dict):从字典对象导入数据,Key是列名,Value是数据
    
    df.to_csv(filename):导出数据到CSV文件
    df.to_excel(filename):导出数据到Excel文件
    df.to_sql(table_name, connection_object):导出数据到SQL表
    df.to_json(filename):以Json格式导出数据到文本文件
    '''
    
    '''
    pd.DataFrame(np.random.rand(20,5)):创建20行5列的随机数组成的DataFrame对象
    pd.Series(my_list):从可迭代对象my_list创建一个Series对象
    df.index = pd.date_range('1900/1/30', periods=df.shape[0]):增加一个日期索引
    '''
    
    # 数据反转
    df[::-1].reset_index(drop=True)
    
    # 查看数据集
    df.dtypes # 数据类型
    df_hour['count']= df_hour['count'].astype(float) # 换数据类型
    df.head df.tail
    
    '''
    df.head(n):查看DataFrame对象的前n行
    df.tail(n):查看DataFrame对象的最后n行
    df.shape:查看行数和列数
    df.info():查看索引、数据类型和内存信息
    df.describe():查看数值型列的汇总统计
    s.value_counts(dropna=False):查看Series对象的唯一值和计数
    df.apply(pd.Series.value_counts):查看DataFrame对象中每一列的唯一值和计数
    '''
    
    df.index df.index = range(1,len(df)+1) #重建索引
    # 排序
    df = df.sort_values('count', ascending=False)
    df = df.sort_values(by=['ip','origservice'])
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html?highlight=sort_values#pandas.DataFrame.sort_values
    # 子集
    df = df[['title', 'filename', 'from', 'to']]
    
    # 选择数据 
    df['cloumns']
    # 切片
    df[1:3]  df['indexstart':'indexend']
    # 子集 loc通过标签选择数据,iloc通过位置选择数据
    df.loc['indexstart':'indexend',['c1','c2']] 
    # 提取一个值
    df.loc['indexs','c1']
    # 第一行 series
    df.iloc[0] 
    # 子集
    df.iloc[1:2,3:5] 
    df.iloc[[1,3,5],:] 
    # 游标提取一个值
    df.iat[1,1]
    # 列取值
    df.loc[1index,'col'] 
    df['col'].iloc[0]
    '''
    df[col]:根据列名,并以Series的形式返回列
    df[[col1, col2]]:以DataFrame形式返回多列
    s.iloc[0]:按位置选取数据
    s.loc['index_one']:按索引选取数据
    df.iloc[0,:]:返回第一行
    df.iloc[0,0]:返回第一列的第一个元素
    '''
    
    #筛选
    # http://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html
    
    SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;
    tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
    
    SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;
    is_dinner = tips[tips['time'] == 'Dinner'].head(5)
    is_dinner.value_counts()
    nlargest(3) #最大的三条
    
    SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;
    tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
    
    SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;
    tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
    
    SELECT * FROM tips WHERE size in (1,2,3,4,5);
    inlist = [1,2,3,4,5]
    tips['size'].isin(inlist)
    tips = tips.loc[(tips['size'] == 1) | (tips['size'] == 2)]
    ~tips['size'].isin(inlist) # not in
    tips[~tips['size'].isin(inlist)]
    tips.dropna(how='all')
    
    SELECT * FROM frame WHERE col2 IS NULL;
    frame[frame['col2'].isnull()]
    
    SELECT * FROM frame WHERE col1 IS NOT NULL;
    frame[frame['col1'].notnull()]
    
    df_downfile = self.df[~self.df['filename'].isnull() & ((self.df['trace_t']!='downfile_http') | (self.df['trace_t']=='web_url'))]
    
    SELECT sex, count(*) FROM tips GROUP BY sex;
    tips.groupby('sex').size()
    # count 返回每一个值
    tips.groupby('sex')['total_bill'].count().reset_index(name='counts')
    df.groupby(['to'])['to'].transform('count') #每条后面都跟着大小
    
    SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;
    tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
    
    # 可以一次应用多个功能。 例如,假设我们希望看到提示量与星期几不同 - agg()允许您将字典传递到分组的DataFrame,指示哪些函数应用于特定的列。
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html?highlight=agg#pandas.core.groupby.DataFrameGroupBy.agg
    SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;
    tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) #np.std https://docs.scipy.org/doc/numpy/reference/routines.statistics.html
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.aggregate.html
    df.agg("mean", axis="columns")
    0    2.0
    1    5.0
    2    8.0
    3    NaN
    
    # 使用DataFrame模糊筛选数据(类似SQL中的LIKE):
    # https://pandas.pydata.org/pandas-docs/stable/api.html#string-handling
    df[df['plan'].str.contains(r'.*?user.*')] #使用正则表达式进行模糊匹配,*匹配0或无限次,?匹配0或1次
    # 使用DataFrame进行数据转换(后期补充说明)
    df['plan'] = df_obj['plan'].str.replace('user(.{2,})info','\\1')#可以使用正则表达式
    # apply 详细说明 
    # http://stackoverflow.com/questions/17841149/pandas-groupby-how-to-get-a-union-of-strings/17841294
    # https://my.oschina.net/lionets/blog/280332
    #相同的合并到一个数组
    df_o = df.groupby('name')['a'].apply(lambda x: "{%s}" % ', '.join(str(x)))
    df_a = df.groupby('name')['a'].apply(list)
    df_b = df.groupby('name')['a'].unique().reset_index() #推荐
    
    # 分段
    pd.cut(df['age'],bins=[0,18,35,60,90],labels=[...])
    
    SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;
    pd.merge(df1, df2, on='key')
    df_crc = pd.merge(df_crc, df_crc_list, how='left', left_on="app_crc", right_on="crc") # how='outer'
    
    SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;
    pd.concat([df1, df2],ignore_index=True) #无索引才能做sum
    '''
    df1.append(df2):将df2中的行添加到df1的尾部
    pd.concat([df1, df2],axis=1):将df2中的列添加到df1的尾部
    df1.join(df2,on=col1,how='inner'):对df1的列和df2的列执行SQL形式的join
    '''
    
    SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;
    tips.nlargest(10+5, columns='tip').tail(10)
    tips.sort_values('tip', ascending=False).head(10 + 5).tail(10)
    
    # 例子 这个这样写不好
    SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY day, rn;
    (tips.assign(rn=tips.sort_values(['total_bill'], ascending=False).groupby(['day']).cumcount() + 1).query('rn < 3').sort_values(['day','rn']))
    
    (tips.assign(rnk=tips.groupby(['day'])['total_bill'].rank(method='first', ascending=False)).query('rnk < 3').sort_values(['day','rnk']))
    
    UPDATE tips SET tip = tip*2 WHERE tip < 2;
    tips.loc[tips['tip'] < 2, 'tip'] *= 2
    
    DELETE FROM tips WHERE tip > 9;
    tips = tips.loc[tips['tip'] <= 9]
    
    # 插入一列
    df['a'] = 1
    df.insert(1,'a', np.random.randn(6))
    del df['a']
    # 删数据
    df = df.drop(['d','e'], axis=1)
    df = df.drop(df[df['支行'] == '总计'].index) # axis=0 行
    
    # https://pandas.pydata.org/pandas-docs/stable/api.html#groupby
    # 和 reset_index 才能命名列名
    df.groupby(['to'], as_index=False).sum().reset_index()
    # 计数
    df.groupby(['to'], as_index=False)['to'].value_counts()
    df['count'] = df.groupby(['to'], as_index=False)['to'].transform('count')
    df['size'] = df.groupby(['A','B']).transform(np.size)
    
    # 分组计算
    df.groupby(['id','blankid']).aggregate(np.sum)
    
    '''
    df[df[col] > 0.5]:选择col列的值大于0.5的行
    df.sort_values(col1):按照列col1排序数据,默认升序排列
    df.sort_values(col2, ascending=False):按照列col1降序排列数据
    df.sort_values([col1,col2], ascending=[True,False]):先按列col1升序排列,后按col2降序排列数据
    df.groupby(col):返回一个按列col进行分组的Groupby对象
    df.groupby([col1,col2]):返回一个按多列进行分组的Groupby对象
    df.groupby(col1)[col2]:返回按列col1进行分组后,列col2的均值
    df.pivot_table(index=col1, values=[col2,col3], aggfunc=max):创建一个按列col1进行分组,并计算col2和col3的最大值的数据透视表
    df.groupby(col1).agg(np.mean):返回按列col1分组的所有列的均值
    data.apply(np.mean):对DataFrame中的每一列应用函数np.mean
    data.apply(np.max,axis=1):对DataFrame中的每一行应用函数np.max
    '''
    
    # 各组数据量
    df.size()
    # 描述
    df.describe()
    '''
    df.describe():查看数据值列的汇总统计
    df.mean():返回所有列的均值
    df.corr():返回列与列之间的相关系数
    df.count():返回每一列中的非空值的个数
    df.max():返回每一列的最大值
    df.min():返回每一列的最小值
    df.median():返回每一列的中位数
    df.std():返回每一列的标准差
    '''
    
    # df列转series
    df_keyword = self.df[['keyword']]
    df_keyword =df_keyword.squeeze() # 一个值直接返回值数据类型
    if isinstance(df_title, str):
        keywordstr = str(df_title)
    else:
        tmp = df_title.tolist()
        keywordstr = ','.join(tmp)
    
    df_keyword.tolist()
    
    # 缺失值处理
    df.fillna('')
    df.fillna(method='pad', limit=1)
    # pad 前一个数据 bfill 后一个  limit 限制 
    df.fillna(df.mean()) # 平均数
    # 插值
    df.interpolate()
    
    # 针对index
    df.interpolate(method='values') time
    
    df.dropna() # axis=1 列
    df.dropna(how='all') #所有列的值都为空 删除该行
    # 找出重复
    df.duplicated()
    # 去重
    df.drop_duplicates(subset=None, keep='first', inplace=False)
    '''
    subset : column label or sequence of labels, optional 
    用来指定特定的列,默认所有列
    keep : {‘first’, ‘last’, False}, default ‘first’ 
    删除重复项并保留第一次出现的项
    inplace : boolean, default False 
    是直接在原来数据上修改还是保留一个副本
    '''
    pd.notnull() # 是否为 null
    # 替换
    df = df.replace('', np.nan)
    df1 = df.replace(np.nan,' ', regex=True) #可以有re
    df.replace(1,2) #不常用 直接apply 更有用
    
    '''
    df.columns = ['a','b','c']:重命名列名
    df.columns.values:获取列名
    df.dtypes
    pd.isnull():检查DataFrame对象中的空值,并返回一个Boolean数组
    pd.notnull():检查DataFrame对象中的非空值,并返回一个Boolean数组
    df.dropna():删除所有包含空值的行
    df.dropna(axis=1):删除所有包含空值的列
    df.dropna(axis=1,thresh=n):删除所有小于n个非空值的行
    df.fillna(x):用x替换DataFrame对象中所有的空值
    s.astype(float):将Series中的数据类型更改为float类型
    s.replace(1,'one'):用‘one’代替所有等于1的值
    s.replace([1,3],['one','three']):用'one'代替1,用'three'代替3
    df.rename(columns=lambda x: x + 1):批量更改列名
    df.rename(columns={'old_name': 'new_ name'}):选择性更改列名
    df.set_index('column_one'):更改索引列
    直接设置成日期
    df['date'] = pd.to_datetime(df['date'])
    df.set_index("date", inplace=True)
    df.add_prefix('x_') :加前缀
    df.add_suffix('_x') :加后缀
    df.rename(index=lambda x: x + 1):批量重命名索引
    '''
    df.select_dtypes(include='number')
    
    # 列操作俩种方式,生成新列操作性强
    dst_ip['addr'] = dst_ip['ip'].apply(lambda x: str(find(x)).split('\t'))
    
    def oneIP(x):
        """Return the row's 'ip' value if non-empty, else 'inip', else None."""
        for field in ('ip', 'inip'):
            if x[field] != '':
                return x[field]
        return None
    
    df_ip['ip'] = df_ip.apply(oneIP,axis=1)
    # 直接产生list
    df_a = df.groupby('name')['a'].apply(list)
    
    def get_city(x):
        # Identity placeholder: returns the address value unchanged.
        # NOTE(review): presumably meant to extract a city from the addr
        # string — confirm intended lookup logic before relying on it.
        return x
    dstip['city'] = dstip['addr'].apply(get_city) 
    
    def sizes(s): 
        """Attach human-readable size strings derived from s['size'] (bytes).

        Adds 'size_kb', 'size_mb' and 'size_gb' entries, then returns the
        (mutated) row *s* — intended for df.apply(sizes, axis=1).

        Fixes vs. original: removed the unreachable `return a, b, c`
        (a/b/c were never defined), and replaced `locale.format`, which
        was removed in Python 3.12, with `locale.format_string`.
        """
        for unit, power in (('kb', 1), ('mb', 2), ('gb', 3)):
            value = s['size'] / 1024.0 ** power
            # grouping=True inserts thousands separators per the active locale.
            s['size_' + unit] = locale.format_string("%.1f", value, grouping=True) + ' ' + unit.upper()
        return s 
    
    df_test = df_test.append(rows_list) 
    df_test = df_test.apply(sizes, axis=1) 
    df_test['size_kb'], df_test['size_mb'], df_test['size_gb'] = zip(*df_test['size'].apply(sizes))
    df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
    # 展现类型
    df.style.format("{:.2%}")
    df['c'] = ['a', 'b', 'c', 'd']
    df.style.format({'c': str.upper})
    
    # 广播 所有值操作
    row = df.iloc[1] # .ix 已从 pandas 移除:按位置用 iloc,按标签用 loc
    
    df.sub(row,axis=1)
    
    #求哑变量    
    dumiper=pd.get_dummies(df['key'])
    df['key'].join(dumiper)
    #透视表和交叉表 跟sas里面的proc freq步类似  R里面的aggrate和cast函数类似  
    pd.pivot_table()  
    pd.crosstab() #
    # http://python.jobbole.com/81212/
    
    # 将dataframe中,某列进行清洗的命令  
    删除换行符:misc['product_desc'] = misc['product_desc'].str.replace('\n', '')  
    删除字符串前后空格:df["Make"] = df["Make"].map(str.strip)  
    
    # 如果用模糊匹配的话,命令是:
    rpt[rpt['STK_ID'].str.contains(r'^600[0-9]{3}$')]
    
    #其他的一些技巧  
    df2[df2['A'].map(lambda x:x.startswith('61'))]  #筛选出以61开头的数据  
    df = df['col'].map('{:,.2f}'.format) #直接做string 格式
    
    # input:
            A   B
    0  [1, 2]  10
    1  [5, 6] -20
    # output:
       A   B
    0  1  10
    1  2  10
    2  5 -20
    3  6 -20
    
    # https://stackoverflow.com/questions/38428796/how-to-do-lateral-view-explode-in-pandas
    # 这是垃圾方法
    A = np.asarray(df.A.values.tolist())
    B = np.stack([df.B for _ in xrange(A.shape[1])]).T
    P = np.stack([A, B])
    pd.Panel(P, items=['A', 'B']).to_frame().reset_index(drop=True)
    
    df1 = df.A.apply(pd.Series).stack().rename('A')
    df2 = df1.to_frame().reset_index(1, drop=True)
    df2.join(df.B).reset_index(drop=True)
    
    pd.DataFrame([[item]+list(df.loc[line,'B':]) for line in df.index for item in df.loc[line,'A']],
                 columns=df.columns)
    
    # 这个推荐在很多场合
    rows = []
    for i, row in df.iterrows():
        for a in row.A:
            rows.append([a, row.B])
    pd.DataFrame(rows, columns=df.columns)
    
    # 0.25版本里可以 #https://pandas.pydata.org/pandas-docs/version/0.25/reference/api/pandas.DataFrame.explode.html
    df.assign(A=df['A'].str.split(',')).explode('A')
    df.assign(A=df['A']).explode('A')
    df.explode('A')
    # 推荐2
    df_tmp = df['A'].apply(pd.Series) #分为多列
    # 再合并起来
    df = pd.concat([df,df_tmp],axis='columns')
    
    for i, row in df.iterrows():
            for a in row['ip']:
                row = row.copy()
                row['ipv4'] = a
                rows.append(row)    
    df = pd.DataFrame(rows)
    # https://stackoverflow.com/questions/32468402/how-to-explode-a-list-inside-a-dataframe-cell-into-separate-rows
    df['A'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('A')
    df_tmp = df_vms.ip.apply(pd.Series) #分为多列
    # 再合并起来
    df = pd.concat([df,df_tmp],axis='columns')
    
    # categories与set_categories
    # 给categories赋值,可以改变类别标签。赋值的时候是按照顺序进行对应的。a对应very good,b对应good,c对应very bad。操作完成之后,原来的标签a就变成了very good标签。 
    # 
    # 该列为数字
    df['col'].apply(lambda x:x.isalnum())
    df['col'].str.isdigit()
    
    # 
    
    def df_rowspan(df, rowname):
        """Compute HTML-style rowspan counts for the hierarchical columns *rowname*.

        For each prefix of *rowname*, consecutive rows sharing the same combined
        key form a run: the first row of a run gets the run length in a new
        '<col>_rowspan' column, every other row of the run gets 0. Returns the
        frame with one such column added per entry in *rowname*.

        NOTE(review): the first loop iteration writes temp columns into the
        caller's DataFrame before the local rebind via drop() — confirm callers
        don't rely on the input frame staying untouched.
        """
        for x in range(0,len(rowname)):
            # Combined key for this level: values of the first x+1 columns as a list.
            df['tmp_row'] = df[rowname[:x+1]].values.tolist()
            tmp = df['tmp_row'] != df['tmp_row'].shift() # True where a new run starts (compare each row to the previous one)
            df['tmp_x'] = tmp.astype(int)
            df['x_row'] = tmp.cumsum() # running run id — each run gets its own group index
            df['x_c'] = df.groupby(['x_row']).cumcount(ascending=False) + 1 # rows remaining in the run, counted backwards
            df[rowname[x] + '_rowspan'] = df['tmp_x'] * df['x_c'] # run length on the first row of a run, 0 elsewhere
            df = df.drop(['tmp_row','tmp_x', 'x_row', 'x_c'], axis=1)
            pass
    
        return df
    
    # pandas 方法中最简单的查找是
    # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
    df.where(df['A']>=df2['B_low'])
    df.where(df['A']>=df2['B_low'])
    df.query('a>b')
    df = pd.merge(df.assign(key=0), df.assign(key=0), on='key').drop('key', axis=1)
    
    # 打印所有
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    
    # https://zhuanlan.zhihu.com/p/32929522
    # 根据要求进行meger
    df1['conm'] = df1['conm'].map(lambda x: get_closest_match(x, df2['conm']))
    # merge two database
    result = pd.merge(df1,df2, on = ['conm','year']) #only matched observations
    result_all = pd.merge(df1,df2, how = 'left', on = ['conm','year'],indicator = 'indicator')#Keep all the left(master file)
    # 多级索引
    # https://blog.csdn.net/Darkman_EX/article/details/80723802
    
    # list split to row
    rows = []
    for i, row in df.iterrows():
        for a in row['iplist']:
            row = row.copy()
            row['ip'] = a
            rows.append(row)
    
    df_vms_ip = pd.DataFrame(rows)
    
    # 多级指标生成树状图
    cols = ['一级指标', '二级指标', '三级指标','四级指标']
        # for name in cols:
            # colname_list = df_zb[name].drop_duplicates().values.tolist()
            # cols_list.append(colname_list)
    
        def tree(df, cols, keys):
            """Recursively fold hierarchical columns into nested {'name', ..., 'children'} dicts.

            cols: ordered level columns (first entry is the current level);
            keys: value columns copied onto leaf nodes (-1 on interior nodes).
            """
            nodes = []
            level_col = cols[0]
            for label in df[level_col].drop_duplicates().values.tolist():
                subset = df[df[level_col] == label]
                node = {'name': label}
                if len(cols) > 1:
                    # Interior node: key columns carry a -1 placeholder.
                    for k in keys:
                        node[k] = -1
                    node['children'] = tree(subset, cols[1:], keys)
                else:
                    # Leaf: take the key values from the first matching row.
                    for k in keys:
                        node[k] = subset[k].values.tolist()[0]
                nodes.append(node)
            return nodes
    
        pprint.pprint(tree(df_zb, cols, ['id']))
    
        df.pipe(PrettyPandas).as_currency('GBP', subset='A').as_percent(subset='B').total().average()
    
        # 前后差1
        df_title['a'] = df_title['ind'].diff() == 1
    
        # https://www.chzzz.club/post/167.html
        get_dummies( ) #将分类变量编码为虚拟(二进制)变量
        factorize #将分类变量编码为数字
    
        import pandas_profiling
    

    相关文章

      网友评论

        本文标题:pandas sql使用笔记

        本文链接:https://www.haomeiwen.com/subject/kaeinctx.html