美文网首页
Python-121 先对两个excel合并,删除trim空格,

Python-121 先对两个excel合并,删除trim空格,

作者: RashidinAbdu | 来源:发表于2021-09-13 12:55 被阅读0次
    1. 合并指定的两个excel:
    #参考https://blog.csdn.net/milton2017/article/details/54406482
    #merge two files
    
    import pandas as pd
    import os
    firstname="English-57-3-ZM-XN-D5-91-SCFAs-结果"
    firstname_No="Taxon_"+"14.3_"+firstname
    second="English-57-3-ZM-XN-D5-91-SCFAs-结果"
    #first = firstname +'-SCFA'
    #6271270
    path1="D:\\20210911-FINAL\\scfa\\"+firstname+".xlsx"
    path2="D:\\20210911-FINAL\\scfa\\"+second+".xlsx"
    df1 = pd.DataFrame(pd.read_excel(path1))
    df2 = pd.DataFrame(pd.read_excel(path2))
    print(df1)
    print(df2)
    frames=[df1,df2]
    #final_file=df1.append(df2)可以上下合并
    final_file=pd.concat(frames)
    #final_file = pd.merge(df1, df2)
    final_file.to_excel('D:\\20210911-FINAL\\merged\\'+firstname+'-SPME-SCFA-Merged.xlsx', index=False)
    #print with color
    import sys 
    def prRed(skk): print("\033[91m {}\033[00m" .format(skk))
    prRed(final_file)
    
    import pandas as pd
    dffnl = pd.read_excel('D:\\20210911-FINAL\\merged\\'+firstname+'-SPME-SCFA-Merged.xlsx')
    print("Here is the df:\n",dffnl)
    
    #删除空格等Trim the free spaces in front of the data
    cols = dffnl.select_dtypes(['object']).columns
    dffnl[cols] = dffnl[cols].apply(lambda x: x.str.strip())
    #df1=df.sort_index(axis=1)
    #df.sort_values(by='ID')
    dffnl1 = dffnl.sort_values(by=['Name'])
    print("Here is the df1:\n",dffnl1)
    merged_path='D:\\20210911-FINAL\\merged\\'+firstname_No+'-SPME-SCFA-Merged.xlsx'
    dffnl1.to_excel(merged_path)
    
    import pandas as pd
    df = pd.read_excel(merged_path)
    print("Here is the df:\n",df)
    
    #trim
    cols = df.select_dtypes(['object']).columns # Trim the free spaces in front of the data
    df[cols] = df[cols].apply(lambda x: x.str.strip())
    #df1=df.sort_index(axis=1)
    #df.sort_values(by='ID')
    df1 = df.sort_values(by=['Name'])
    print("Here is the df1:\n",df1)
    df1.to_excel(merged_path)
    
    ## 1 Remove duplicates on all columns
    #df.drop_duplicates()
    import sys #print with color
    def prRed(skk): print("\033[91m {}\033[00m" .format(skk))
    dfx = pd.read_excel(merged_path)
    #2 Remove duplicates on select column
    df2=dfx.drop_duplicates('Name', keep='last')
    prRed(df2)
    df2.to_excel(merged_path)
    

    2.进行VLOOKUP:
    0)将所有excel文件的文件名提取到指定表格中;
    # 打开目标目录并且对其中的所有文件提取其文件名,并把文件名写到xl文件
    import os
    file_dir = "D:\\20210911-FINAL\\merged\\Taxon-dropduplicated\\"
    for root, dirs, files in os.walk(file_dir, topdown=False):
        print(root)     # 当前目录路径
        print(dirs)     # 当前目录下所有子目录
        print(files)        # 当前路径下所有非目录子文件
    f = open (r'D:\\20210911-FINAL\\Names_all.txt','w')
    print (files,file = f)
    f.close()
    
    #重开个txt文件,进行换行,进行替换,来调整格式,
    fin = open('D:\\20210911-FINAL\\Names_all.txt', "rt") #打开一个txt文件
    data = fin.read()#读取txt
    #replace all occurrences of the required string
    data = data.replace(".xlsx', '", "\n") #找到特殊字符进行替换
    #data = data.replace(".qgd', '", "\n") #找到特殊字符进行替换
    data = data.replace(".xlsx']", "")
    #data = data.replace("['", "")
    #close the input file
    fin.close()
    #open the input file in write mode
    fin = open('D:\\20210911-FINAL\\Names_all.txt', "wt")
    #overrite the input file with the resulting data
    fin.write(data)#将处理好的结果再写进前面的txt内。
    #close the file
    fin.close()
    print("Name of data:",data)
    
    import os
    import xlwt
    #p="D:\\20210911-FINAL\\scfa\\"#
    datas=os.listdir('D:\\20210911-FINAL\\merged\\Taxon-dropduplicated\\')
    datas=list(datas)
    a = xlwt.Workbook(encoding='utf-8')
    s = a.add_sheet('namelist',cell_overwrite_ok=True)
    i=0
    for x in datas:
            s.write(i,0,x)
            i=i+1
    a.save('D:\\20210911-FINAL\\merged\\Merged-SCFA-SPME-list-913.xls')
    
    
    • 这里得到的list用来下一段赋值!
    • 对于出现的重复值进行drop-duplicates删除重复值操作(保留最终出现的那个值):
    import pandas as pd
    df = pd.read_excel('D:\\20210911-FINAL\\Dropduplicates.xlsx')
    print("Here is the df:\n",df)
    
    #trim
    cols = df.select_dtypes(['object']).columns # Trim the free spaces in front of the data
    df[cols] = df[cols].apply(lambda x: x.str.strip())
    #df1=df.sort_index(axis=1)
    #df.sort_values(by='ID')
    df1 = df.sort_values(by=['Name'])
    print("Here is the df1:\n",df1)
    df1.to_excel("D:\\20210911-FINAL\\Dropduplicates-1.xlsx")
    
    ## 1 Remove duplicates on all columns
    #df.drop_duplicates()
    import sys #print with color
    def prRed(skk): print("\033[91m {}\033[00m" .format(skk))
    dfx = pd.read_excel('D:\\20210911-FINAL\\Dropduplicates-1.xlsx')
    #2 Remove duplicates on select column
    df2=dfx.drop_duplicates('Name', keep='last')
    prRed(df2)
    df2.to_excel('D:\\20210911-FINAL\\Dropduplicates-2.xlsx')
    


    • 下面完整的代码进行VLOOKUP相关过程:

    1)读取指定文件夹中的excel文件(每次只需要将文件名赋值,而路径对所有文件都沿用);
    2)将化学物质的列提到第一列;
    3)将菌株编号替换到peakarea的列名;
    4)Name列进行Trim, 即:Trim the free spaces in front of the data
    5)上面已经得到了chem在第一列的没问题的数据!然后进行vlookup!
    
    import pandas as pd
    import os
    excelName = "Taxon_14.3_English-57-3-ZM-XN-D5-91-SCFAs-结果-SPME-SCFA-Merged"
    path="C:\\Users\\Mr.R\\Desktop\\test910\\"+excelName+".xlsx"
    df= pd.read_excel(path)
    df.head()
    print(df)
    #将化学物质的列提到第一列
    Name = df['Name']
    df.drop(labels=['Name'], axis=1,inplace = True)
    df.insert(0, 'Name', Name)
    print(df)
    newpath="C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final.xlsx"
    df.to_excel(newpath)
    
    from openpyxl import load_workbook
    path1 = r"C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final.xlsx"
    e= load_workbook(path1) #打开excel
    E= e.active   #表示当前活跃的表,本案例中 当前活跃表为sheet1
    #也可以使用 E = e.get_sheet_by_name('Sheet1') 来获取工作表1
    #将excel中 2行3列 对应的数据传给a
    a = str(E.cell(row=2, column=4).value)
    #b=E.max_row         #读取excel行数
    #c=E.max_column     #读取excel列数
    print(a)# 将上面获取值给单元格 1,7:
    wb=load_workbook(path1) #打开指定excel表
    sheet=wb["Sheet1"] #打开指定的Excel表中的sheet页
    sheet.cell(1, 9).value=a  #修改第二行第三列的数据为hhh
    print(sheet.cell(1,9).value)   #同时工作台打印出被修改的单元格值
    update=sheet.cell(1,9).value  #将修改后的值赋值update
    print(update)  #update值#### 进行合并 # 1 source table
    path2 = r"C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final2.xlsx"
    wb.save(path2)
    
    import pandas as pd
    dffnl = pd.read_excel("C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final2.xlsx")
    print("Here is the df:\n",dffnl)
    
    #Trim the free spaces in front of the data
    cols = dffnl.select_dtypes(['object']).columns
    dffnl[cols] = dffnl[cols].apply(lambda x: x.str.strip())
    #df1=df.sort_index(axis=1)
    #df.sort_values(by='ID')
    dffnl1 = dffnl.sort_values(by=['Name'])
    ("Here is the df1:\n",dffnl1)
    dffnl1.to_excel("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx")
    
    # import pandas
    import pandas as pd
    from openpyxl import *
    # read csv data
    from openpyxl import load_workbook
    path = r"C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx"
    e= load_workbook(path) #打开excel
    E= e.active   #表示当前活跃的表,本案例中 当前活跃表为sheet1
                        #也可以使用 E = e.get_sheet_by_name('Sheet1') 来获取工作表1
    b = str(E.cell(row=2, column=7).value) # 将excel中 2行13列 对应的数据传给a
    #b=E.max_row         #读取excel行数
    #c=E.max_column     #读取excel列数
    print(b)# 将上面获取值给单元格 1,7:
    wb=load_workbook("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx") #打开指定excel表
    sheet=wb["Sheet1"] #打开指定的Excel表中的sheet页
    sheet.cell(1, 13).value=b  #修改第二行第三列的数据为hhh
    wb.save("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx")  #保存指定Excel工作表
    print(sheet.cell(1, 13).value)   #同时工作台打印出被修改的单元格值
    update=sheet.cell(1, 13).value  #将修改后的值赋值update
    print(update)  #update值#### 进行合并 # 1 source table
    df1 = pd.read_excel("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx")
    print(df1)
    #上面已经得到了chem在第一列的没问题的数据!
    #vlookup
    import pandas as pd
    # 地区信息表
    strain=excelName
    df_location = pd.read_excel("D:\\20210911-FINAL\\Vlookup-SCFA-SPME-list-part1-913.xlsx" )
    df_location.head()
    # 数据库导出表
    strain_path="C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx"
    df_number = pd.read_excel(strain_path)
    df_number.head()
    print(df_number)
    # 只筛选第二个表的少量的列(只选取表二中市区和用户人数)
    df_number = df_number[[ "Name", b]]
    df_number.head()
    df_merge = pd.merge(left=df_location, right=df_number, left_on="Name", right_on="Name",how='outer')
    df_merge.head()
    df_merge.to_excel("D:\\20210911-FINAL\\Vlookup-SCFA-SPME-list-part1-913.xlsx", index=False)
    
    这样就完整的结束了!

    • 有个问题:提取文件夹里的所有文件的名称,并用循环逐个提取文件名,并VLOOKUP第一步进行赋值,但是这个是只是坐到逐个提取了,但进行赋值报错!待解决!

    • 逐个提取文件名:
    #读取所有文件名
    import os
    def readname():
        filePath = 'C:\\Users\\Mr.R\\Desktop\\test910\\'
        name = os.listdir(filePath)
        return name
    if __name__ == "__main__":
        name = readname()
        print(name)
        for i in name:
           print(i)
    
    但是这里的输出为一连串字符:

    image.png

    • 逐行读取txt文件:
    # 方法一 readline() :
    # 打开文件
    f = open("C:\\Users\\Mr.R\\Desktop\\final\\1.txt")
    # 调用文件的 readline()方法
    line = f.readline()  # 每次读取一行内容
    while line:
        print(line, end='')  # end = ''表示不换行
        # print(line)  # 默认换行
        line = f.readline()
    f.close()
    
    
    

    相关文章

      网友评论

          本文标题:Python-121 先对两个excel合并,删除trim空格,

          本文链接:https://www.haomeiwen.com/subject/ssqfgltx.html