
DataFrame Practice

Author: dechuan | Published 2018-12-11 09:31

    I recently wrote a small program in Python and took the chance to review some pandas syntax along the way; this post is a record of it.

    1. How to change the working directory and read a file from it

    import os
    import pandas as pd

    print(os.getcwd())            # print the current working directory
    os.chdir(r'D:\warning data')  # switch the working directory to D:\warning data
    print(os.listdir())           # list the files in the new working directory
    df = pd.read_excel('alarm20180915-1130.xls')
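
    If you do not want to change the working directory, the same file can also be read by its full path; a minimal sketch (the alarm*.xls pattern below is an assumption about the file naming):

    import glob
    import pandas as pd

    # read the workbook directly by its full path instead of calling os.chdir first
    df = pd.read_excel(r'D:\warning data\alarm20180915-1130.xls')

    # or list every alarm export in the directory
    for path in glob.glob(r'D:\warning data\alarm*.xls'):
        print(path)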
    

    2. How to select specific columns and values from the Excel data

    SystemDf=df['FIRSTOCCURRENCE']
    
    • Result:
    0       2018-09-15 00:00:49
    1       2018-09-15 00:39:05
    2       2018-09-15 00:57:22
    3       2018-09-15 01:00:55
    4       2018-09-15 01:05:41
    5       2018-09-15 01:05:41
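
    The FIRSTOCCURRENCE strings can also be parsed into real datetimes, which gives the yyyy-mm-dd part used for grouping in step 4 without a regex; a minimal sketch (OCCUR_DT is just an illustrative helper column name):

    # parse the timestamps once; .dt.date then yields the date part of each alarm
    df['OCCUR_DT'] = pd.to_datetime(df['FIRSTOCCURRENCE'])
    print(df['OCCUR_DT'].dt.date.head())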
    
    # select two columns, restricted to rows that satisfy a condition on another column
    SystemDf=df.loc[(df['STRINGA']=='P1')|(df['STRINGA']=='P2')|(df['STRINGA']=='P3'),['SUMMARY','FIRSTOCCURRENCE']]
    # iterate over every row of one column
    for i in SystemDf.index:
        print(SystemDf['FIRSTOCCURRENCE'][i])
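
    # the three chained | conditions can be written more compactly with isin (a sketch, same selection as above):
    SystemDf = df.loc[df['STRINGA'].isin(['P1', 'P2', 'P3']),
                      ['SUMMARY', 'FIRSTOCCURRENCE']]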
    
    # select the rows whose value in a given column equals a specific value
    PartUsage=frame2.loc[frame2['PARTID'] == '159'] # frame2 is the usage table built in step 4 below
    
    • Result:
                             USAGE
    PARTID FIRSTOCCURRENCE        
    159    2018-09-15       0.7825
           2018-09-16       0.7880
           2018-09-17       0.7940
           2018-11-26       0.7000
           2018-11-27       0.7045
           2018-11-28       0.7115
           2018-11-29       0.7185
           2018-11-30       0.7220
    

    3. How to extract the useful information from the original Excel file and save it as a new file

    # The extracted data will be accessed heavily later, so collect it in a dict first and then save it with to_csv:
    # the dict keys become the column names and each value list becomes a column, giving a file that Excel can open
    # and that later code can load easily.
    import re
    from pandas import DataFrame
    diction={}
    diction['FIRSTOCCURRENCE']=[]
    diction['TIME']=[]
    diction['LPAR']=[]
    diction['ID']=[]
    diction['MessageID']=[]
    diction['Message']=[]
    outfile=open('outfile.csv','w',encoding = 'utf-8')
    outlier=open('outlier.csv','w',encoding = 'utf-8')
    filter1=r'(.*?\d)\s+(CP[1-3][A-F])\s+([SJ]\S+)\s+(\S+[^\:])\s+(.*)' # final version: captures most of the summary fields (renamed so it does not shadow the built-in filter)
    #outfile.write('TIME;LPAR;ID;MessageID;Message;')
    #outfile.write(u'\n')
    pattern=re.compile(filter1) # compile the regex once, outside the loop
    for i in SystemDf.index:
        result=pattern.search(SystemDf['SUMMARY'][i])
        if result:
            FIRSTOCCURRENCE=SystemDf['FIRSTOCCURRENCE'][i]
            Time= result.group(1)
            LPAR= result.group(2)
            ID=result.group(3)
            MessageID=result.group(4)
            Message=result.group(5)
            diction['FIRSTOCCURRENCE'].append(FIRSTOCCURRENCE)
            diction['TIME'].append(Time)
            diction['LPAR'].append(LPAR)
            diction['ID'].append(ID)
            diction['MessageID'].append(MessageID)
            diction['Message'].append(Message)
        else:
            outlier.write(SystemDf['SUMMARY'][i])
            outlier.write(u'\n')
    #print('TIME;LPAR;ID;MessageID;Message',file=outfile)
    frame=DataFrame(diction)
    frame.to_csv(outfile,index=False,columns=['FIRSTOCCURRENCE','TIME','LPAR','ID','MessageID','Message'])
    
    outfile.close()
    outlier.close() 
    print('game over')
    summ=frame.groupby(['MessageID']).size()
    print(summ.sort_values(ascending=False)) # sort in descending order to find the most frequent alarms
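    # equivalently (a sketch), value_counts() counts and sorts in descending order in one step:
    print(frame['MessageID'].value_counts())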
    
    # Alternatively (and arguably cleaner): build the DataFrame directly from a dict whose values are the per-column lists.
    import re
    row1=[]
    row2=[]
    row3=[]
    row4=[]
    row5=[]
    row6=[]
    outfile=open('outfile.csv','w',encoding = 'utf-8')
    outlier=open('outlier.csv','w',encoding = 'utf-8')
    filter1=r'(.*?\d)\s+(CP[1-3][A-F])\s+([SJ]\S+)\s+(\S+[^\:])\s+(.*)' # final version: captures most of the summary fields
    pattern=re.compile(filter1) # compile the regex once, outside the loop
    for i in SystemDf.index:
        result=pattern.search(SystemDf['SUMMARY'][i])
        if result:
            FIRSTOCCURRENCE=SystemDf['FIRSTOCCURRENCE'][i]
            Time= result.group(1)
            LPAR= result.group(2)
            ID=result.group(3)
            MessageID=result.group(4)
            Message=result.group(5)
            row1.append(FIRSTOCCURRENCE)
            row2.append(Time)
            row3.append(LPAR)
            row4.append(ID)
            row5.append(MessageID)
            row6.append(Message)
        else:
            outlier.write(SystemDf['SUMMARY'][i])
            outlier.write(u'\n')
    frame1=DataFrame({'FIRSTOCCURRENCE':row1,'TIME':row2,'LPAR':row3,'ID':row4,'MessageID':row5,'Message':row6})
    frame1.to_csv(outfile,index=False,columns=['FIRSTOCCURRENCE','TIME','LPAR','ID','MessageID','Message'])
    
    outfile.close()
    outlier.close() 
    print('game over')
    summ=frame1.groupby(['MessageID']).size()
    print(summ.sort_values(ascending=False)) # sort in descending order to find the most frequent alarms
    
    • Output:
     game over
    MessageID
    +DBA:BANCS     768
    BNROUTACTS0    645
    *IEF099I       548
    +DBA:EISS      215
    DNFO1777E      144
    BNEEPUACTS0    144
    +CSQX209E      108
    DNFO1709E      103
    DNFF4141I      100
    ...
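
    A more pandas-idiomatic alternative to the row-by-row loop is Series.str.extract, which applies the same regex to the whole SUMMARY column at once. A minimal sketch, assuming SystemDf from step 2 and the filter1 pattern above (frame3 and outfile2.csv are just illustrative names):

    # apply the regex to every SUMMARY; rows that do not match become NaN
    parts = SystemDf['SUMMARY'].str.extract(filter1)
    parts.columns = ['TIME', 'LPAR', 'ID', 'MessageID', 'Message']
    parts['FIRSTOCCURRENCE'] = SystemDf['FIRSTOCCURRENCE']

    # rows that did not match play the role of outlier.csv
    outliers = SystemDf.loc[parts['TIME'].isna(), 'SUMMARY']
    frame3 = parts.dropna(subset=['TIME'])
    frame3.to_csv('outfile2.csv', index=False,
                  columns=['FIRSTOCCURRENCE', 'TIME', 'LPAR', 'ID', 'MessageID', 'Message'])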
    

    4. Use a pivot table to aggregate the data

    row1=[]
    row2=[]
    row3=[]
    filter2=r'PART\s(\d+)\s(.*?)\s' # captures the part ID and its usage string
    pattern=re.compile(filter2)
    filter3=r'(\d+)\-(\d+)\-(\d+)\s' # captures the yyyy-mm-dd date
    pattern2=re.compile(filter3)
    for i in DBAWarning.index: # DBAWarning: the DataFrame of DBA usage alarms (its construction is not shown here)
        DBAMess=pattern.search(DBAWarning['Message'][i])
        DBATime=pattern2.search(str(DBAWarning['FIRSTOCCURRENCE'][i]))
        if DBAMess:
            row1.append(DBAMess.group(1))
            # convert the usage string (e.g. '80%') to a number; pivot_table cannot aggregate strings
            usage=float(DBAMess.group(2).strip('%'))/100
            p_float=round(usage,3)
            #p_float=float('%.3f' % usage)
            # keep 3 decimal places
            row2.append(p_float)
            row3.append(DBATime.group(0))
    frame2=DataFrame({'PARTID':row1,'USAGE':row2,'FIRSTOCCURRENCE':row3})
    FrameResult=open('frame2.csv','w',encoding = 'utf-8')
    frame2.to_csv(FrameResult,index=False)
    FrameResult.close() 
    StatisticDBA=pd.pivot_table(frame2,index=['PARTID','FIRSTOCCURRENCE'],values=['USAGE']) # the default aggfunc is the mean
    Input=frame2.loc[frame2['PARTID'] == '159']
    PartUsage=pd.pivot_table(Input,index=['PARTID','FIRSTOCCURRENCE'],values=['USAGE'])
    print(PartUsage)
    
    • Result:
                            USAGE
    PARTID FIRSTOCCURRENCE        
    159    2018-09-15       0.7825
           2018-09-16       0.7880
           2018-09-17       0.7940
           2018-11-26       0.7000
           2018-11-27       0.7045
           2018-11-28       0.7115
           2018-11-29       0.7185
           2018-11-30       0.7220
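
    For reference, the same aggregation can be written with groupby; a small sketch equivalent to the pivot_table call above (mean USAGE per PARTID and day; PartUsage2 is just an illustrative name):

    # groupby equivalent of pd.pivot_table(frame2, index=['PARTID','FIRSTOCCURRENCE'], values=['USAGE'])
    PartUsage2 = (frame2[frame2['PARTID'] == '159']
                  .groupby(['PARTID', 'FIRSTOCCURRENCE'])[['USAGE']]
                  .mean())
    print(PartUsage2)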
    
