【问题描述】:将多个Excel中的数据清理后导入到SQL server 数据库
【
Excel文件数据格式(红色框中为所需数据):
1587977946(1).jpg
数据库库表结构:
1587978120(1).jpg
代码:
import os
import pandas as pd
import datetime
from sqlalchemy import create_engine
import pymssql
import numpy as np
def split_Z(data):
str_list = data.split('.')
if len(str_list) > 1:
[integer, fraction] = str_list
else:
integer, fraction = None, str_list[0]
return integer, fraction
def data_convert_pd(files_name):
# 读取excel文件
df = pd.read_excel(files_name)
# 获取年份,站号
year = df.iloc[0, 2]
stcd = df.iloc[0, 6]
# 选取所需数据
df.drop([0, 1, 2, 3], inplace=True)
df = df[df.columns[[1, 2, 3, 4, 7, 8]]]
# 重新赋值列名
df.columns = ['month', 'day', 'time_start', 'time_end', 'Z', 'Q']
# 新增站号、年份到DataFrame
df['year'] = year
df['STCD'] = stcd
# 原始表中月、日会省略,出现nan值,在这里进行前向填充
df['month'].fillna(method='ffill', inplace=True)
df['day'].fillna(method='ffill', inplace=True)
# 清除脏数据
df.dropna(inplace=True)
# 取平均时间
time_start = pd.to_datetime(df['time_start'], format='%H:%M')
time_end = pd.to_datetime(df['time_end'], format='%H:%M')
df['time'] = (time_start + (time_end - time_start)/2)
# 水位数据在整数相同的情况下,后面数据会自动省略整数部分,这里进行补整处理
df_z = df.apply(lambda x: split_Z(x.Z), axis=1, result_type='expand').fillna(method='ffill')
df.Z = df_z.iloc[:, 0] + '.' + df_z.iloc[:, 1]
# 整合日期时间
df['TM'] = df.apply(lambda x: datetime.datetime(x.year, x.month, x.day, x.time.hour, x.time.minute, x.time.second), axis=1)
df['WPTN'] = 6
df['MSQMT'] = 3
# 选取所需列数据
df = df[['STCD', 'TM', 'Z', 'Q', 'WPTN', 'MSQMT']]
return df
def data_to_db(data):
# data --> pandas.DataFrame
# 创建engine
engine_k = create_engine("mssql+pymssql://user:password@hostname/dbtable", echo=True)
# ! 入库操作前做好数据备份,
# if_exits 参数慎重选择,否则会删除原有库表
# * replace: Drop the table before inserting new values.
# * append: Insert new values to the existing table.
data.to_sql(name='ST_RIVER_R', con=engine_k, if_exists='append', index=None)
if __name__ == "__main__":
# 获取所有excel文件的路径, 可以使用Glob包
files = [os.path.join('实测流量', f) for f in os.listdir('实测流量')]
# 读取Excel数据,并合并到一个DataFrame
df = pd.concat([data_convert_pd(f) for f in files], ignore_index=True)
# 根据实际情况修改部分数据
df.replace({'90300600': '90520600',
'90507600': '90527600',
'90507650': '90527650',
'90507800': '90527800'}, inplace=True)
df.replace(datetime.datetime(2018, 2, 4, 14, 0, 0), datetime.datetime(2018, 2, 4, 14, 1, 0), inplace=True)
# 存入SQL Server 数据库
data_to_db(df)
网友评论