1. 合并指定的两个excel:
#参考https://blog.csdn.net/milton2017/article/details/54406482
#merge two files
import pandas as pd
import os
firstname="English-57-3-ZM-XN-D5-91-SCFAs-结果"
firstname_No="Taxon_"+"14.3_"+firstname
second="English-57-3-ZM-XN-D5-91-SCFAs-结果"
#first = firstname +'-SCFA'
#6271270
path1="D:\\20210911-FINAL\\scfa\\"+firstname+".xlsx"
path2="D:\\20210911-FINAL\\scfa\\"+second+".xlsx"
df1 = pd.DataFrame(pd.read_excel(path1))
df2 = pd.DataFrame(pd.read_excel(path2))
print(df1)
print(df2)
frames=[df1,df2]
#final_file=df1.append(df2)可以上下合并
final_file=pd.concat(frames)
#final_file = pd.merge(df1, df2)
final_file.to_excel('D:\\20210911-FINAL\\merged\\'+firstname+'-SPME-SCFA-Merged.xlsx', index=False)
#print with color
import sys
def prRed(skk): print("\033[91m {}\033[00m" .format(skk))
prRed(final_file)
import pandas as pd
dffnl = pd.read_excel('D:\\20210911-FINAL\\merged\\'+firstname+'-SPME-SCFA-Merged.xlsx')
print("Here is the df:\n",dffnl)
#删除空格等Trim the free spaces in front of the data
cols = dffnl.select_dtypes(['object']).columns
dffnl[cols] = dffnl[cols].apply(lambda x: x.str.strip())
#df1=df.sort_index(axis=1)
#df.sort_values(by='ID')
dffnl1 = dffnl.sort_values(by=['Name'])
print("Here is the df1:\n",dffnl1)
merged_path='D:\\20210911-FINAL\\merged\\'+firstname_No+'-SPME-SCFA-Merged.xlsx'
dffnl1.to_excel(merged_path)
import pandas as pd
df = pd.read_excel(merged_path)
print("Here is the df:\n",df)
#trim
cols = df.select_dtypes(['object']).columns # Trim the free spaces in front of the data
df[cols] = df[cols].apply(lambda x: x.str.strip())
#df1=df.sort_index(axis=1)
#df.sort_values(by='ID')
df1 = df.sort_values(by=['Name'])
print("Here is the df1:\n",df1)
df1.to_excel(merged_path)
## 1 Remove duplicates on all columns
#df.drop_duplicates()
import sys #print with color
def prRed(skk): print("\033[91m {}\033[00m" .format(skk))
dfx = pd.read_excel(merged_path)
#2 Remove duplicates on select column
df2=dfx.drop_duplicates('Name', keep='last')
prRed(df2)
df2.to_excel(merged_path)
2.进行VLOOKUP:
0)将所有excel文件的文件名提取到指定表格中;
# 打开目标目录并且对其中的所有文件提取其文件名,并把文件名写到xl文件
import os
file_dir = "D:\\20210911-FINAL\\merged\\Taxon-dropduplicated\\"
for root, dirs, files in os.walk(file_dir, topdown=False):
print(root) # 当前目录路径
print(dirs) # 当前目录下所有子目录
print(files) # 当前路径下所有非目录子文件
f = open (r'D:\\20210911-FINAL\\Names_all.txt','w')
print (files,file = f)
f.close()
#重开个txt文件,进行换行,进行替换,来调整格式,
fin = open('D:\\20210911-FINAL\\Names_all.txt', "rt") #打开一个txt文件
data = fin.read()#读取txt
#replace all occurrences of the required string
data = data.replace(".xlsx', '", "\n") #找到特殊字符进行替换
#data = data.replace(".qgd', '", "\n") #找到特殊字符进行替换
data = data.replace(".xlsx']", "")
#data = data.replace("['", "")
#close the input file
fin.close()
#open the input file in write mode
fin = open('D:\\20210911-FINAL\\Names_all.txt', "wt")
#overrite the input file with the resulting data
fin.write(data)#将处理好的结果再写进前面的txt内。
#close the file
fin.close()
print("Name of data:",data)
import os
import xlwt
#p="D:\\20210911-FINAL\\scfa\\"#
datas=os.listdir('D:\\20210911-FINAL\\merged\\Taxon-dropduplicated\\')
datas=list(datas)
a = xlwt.Workbook(encoding='utf-8')
s = a.add_sheet('namelist',cell_overwrite_ok=True)
i=0
for x in datas:
s.write(i,0,x)
i=i+1
a.save('D:\\20210911-FINAL\\merged\\Merged-SCFA-SPME-list-913.xls')
-
这里得到的list用来下一段赋值!
- 对于出现的重复值进行drop-duplicates删除重复值操作(保留最终出现的那个值):
import pandas as pd
df = pd.read_excel('D:\\20210911-FINAL\\Dropduplicates.xlsx')
print("Here is the df:\n",df)
#trim
cols = df.select_dtypes(['object']).columns # Trim the free spaces in front of the data
df[cols] = df[cols].apply(lambda x: x.str.strip())
#df1=df.sort_index(axis=1)
#df.sort_values(by='ID')
df1 = df.sort_values(by=['Name'])
print("Here is the df1:\n",df1)
df1.to_excel("D:\\20210911-FINAL\\Dropduplicates-1.xlsx")
## 1 Remove duplicates on all columns
#df.drop_duplicates()
import sys #print with color
def prRed(skk): print("\033[91m {}\033[00m" .format(skk))
dfx = pd.read_excel('D:\\20210911-FINAL\\Dropduplicates-1.xlsx')
#2 Remove duplicates on select column
df2=dfx.drop_duplicates('Name', keep='last')
prRed(df2)
df2.to_excel('D:\\20210911-FINAL\\Dropduplicates-2.xlsx')
1)读取指定文件夹中的excel文件(每次只需要将文件名赋值,而路径对所有文件都沿用);
2)将化学物质的列提到第一列;
3)将菌株编号替换到peakarea的列名;
4)Name列进行Trim, 即:Trim the free spaces in front of the data
5)上面已经得到了chem在第一列的没问题的数据!然后进行vlookup!
import pandas as pd
import os
excelName = "Taxon_14.3_English-57-3-ZM-XN-D5-91-SCFAs-结果-SPME-SCFA-Merged"
path="C:\\Users\\Mr.R\\Desktop\\test910\\"+excelName+".xlsx"
df= pd.read_excel(path)
df.head()
print(df)
#将化学物质的列提到第一列
Name = df['Name']
df.drop(labels=['Name'], axis=1,inplace = True)
df.insert(0, 'Name', Name)
print(df)
newpath="C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final.xlsx"
df.to_excel(newpath)
from openpyxl import load_workbook
path1 = r"C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final.xlsx"
e= load_workbook(path1) #打开excel
E= e.active #表示当前活跃的表,本案例中 当前活跃表为sheet1
#也可以使用 E = e.get_sheet_by_name('Sheet1') 来获取工作表1
#将excel中 2行3列 对应的数据传给a
a = str(E.cell(row=2, column=4).value)
#b=E.max_row #读取excel行数
#c=E.max_column #读取excel列数
print(a)# 将上面获取值给单元格 1,7:
wb=load_workbook(path1) #打开指定excel表
sheet=wb["Sheet1"] #打开指定的Excel表中的sheet页
sheet.cell(1, 9).value=a #修改第二行第三列的数据为hhh
print(sheet.cell(1,9).value) #同时工作台打印出被修改的单元格值
update=sheet.cell(1,9).value #将修改后的值赋值update
print(update) #update值#### 进行合并 # 1 source table
path2 = r"C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final2.xlsx"
wb.save(path2)
import pandas as pd
dffnl = pd.read_excel("C:\\Users\\Mr.R\\Desktop\\final\\draft\\"+excelName+"final2.xlsx")
print("Here is the df:\n",dffnl)
#Trim the free spaces in front of the data
cols = dffnl.select_dtypes(['object']).columns
dffnl[cols] = dffnl[cols].apply(lambda x: x.str.strip())
#df1=df.sort_index(axis=1)
#df.sort_values(by='ID')
dffnl1 = dffnl.sort_values(by=['Name'])
("Here is the df1:\n",dffnl1)
dffnl1.to_excel("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx")
# import pandas
import pandas as pd
from openpyxl import *
# read csv data
from openpyxl import load_workbook
path = r"C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx"
e= load_workbook(path) #打开excel
E= e.active #表示当前活跃的表,本案例中 当前活跃表为sheet1
#也可以使用 E = e.get_sheet_by_name('Sheet1') 来获取工作表1
b = str(E.cell(row=2, column=7).value) # 将excel中 2行13列 对应的数据传给a
#b=E.max_row #读取excel行数
#c=E.max_column #读取excel列数
print(b)# 将上面获取值给单元格 1,7:
wb=load_workbook("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx") #打开指定excel表
sheet=wb["Sheet1"] #打开指定的Excel表中的sheet页
sheet.cell(1, 13).value=b #修改第二行第三列的数据为hhh
wb.save("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx") #保存指定Excel工作表
print(sheet.cell(1, 13).value) #同时工作台打印出被修改的单元格值
update=sheet.cell(1, 13).value #将修改后的值赋值update
print(update) #update值#### 进行合并 # 1 source table
df1 = pd.read_excel("C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx")
print(df1)
#上面已经得到了chem在第一列的没问题的数据!
#vlookup
import pandas as pd
# 地区信息表
strain=excelName
df_location = pd.read_excel("D:\\20210911-FINAL\\Vlookup-SCFA-SPME-list-part1-913.xlsx" )
df_location.head()
# 数据库导出表
strain_path="C:\\Users\\Mr.R\\Desktop\\final\\ChemFirst\\"+excelName+"-ChemFirst.xlsx"
df_number = pd.read_excel(strain_path)
df_number.head()
print(df_number)
# 只筛选第二个表的少量的列(只选取表二中市区和用户人数)
df_number = df_number[[ "Name", b]]
df_number.head()
df_merge = pd.merge(left=df_location, right=df_number, left_on="Name", right_on="Name",how='outer')
df_merge.head()
df_merge.to_excel("D:\\20210911-FINAL\\Vlookup-SCFA-SPME-list-part1-913.xlsx", index=False)
这样就完整的结束了!
-
有个问题:提取文件夹里的所有文件的名称,并用循环逐个提取文件名,并VLOOKUP第一步进行赋值,但是这个是只是坐到逐个提取了,但进行赋值报错!待解决!
#读取所有文件名
import os
def readname():
filePath = 'C:\\Users\\Mr.R\\Desktop\\test910\\'
name = os.listdir(filePath)
return name
if __name__ == "__main__":
name = readname()
print(name)
for i in name:
print(i)
但是这里的输出为一连串字符:
image.png
# 方法一 readline() :
# 打开文件
f = open("C:\\Users\\Mr.R\\Desktop\\final\\1.txt")
# 调用文件的 readline()方法
line = f.readline() # 每次读取一行内容
while line:
print(line, end='') # end = ''表示不换行
# print(line) # 默认换行
line = f.readline()
f.close()
网友评论