现有不同站点的逐日数据,存储在如图所示的十个文件中:
![](https://img.haomeiwen.com/i149566/0cd3da732d0b45f7.png)
表头如下:
![](https://img.haomeiwen.com/i149566/ac73977d68060700.png)
另有各区站号与省份的对应表(dic.txt),表头如下:
![](https://img.haomeiwen.com/i149566/6f963c4b81e71893.png)
需求:
各省份的逐日降水量,存储为txt
思路及源码如下:
1、将同一目录下的所有doc/docx文件转换成txt
# -*- coding: utf-8 -*-
from win32com import client as wc
import os
import fnmatch
all_FileNum = 0  # running count of Word files converted by Translate()
debug = 0  # set to a truthy value to print progress details


def Translate(path):
    """Convert every .doc/.docx file directly inside *path* to a text file.

    A sub-directory ``txt`` is created inside *path* (if it does not exist
    yet) to receive the converted files, and ``txt/fileNames.txt`` is
    rewritten with the full path of every converted file, one per line.
    Calling the function repeatedly on the same directory is safe: the
    ``txt`` directory is reused and Word lock files (``~$*``) are skipped.

    Word is started through COM only when the first Word document is found,
    is reused for all remaining documents and is always shut down at the
    end.  A directory without any Word file therefore completes without
    touching Word at all.

    Each converted file increments the module-level ``all_FileNum``.

    Args:
        path: directory that contains the Word documents.
    """
    global debug, all_FileNum
    if debug:
        print(path)
    # Names of all entries in the source directory.
    files = os.listdir(path)
    # Output directory for the converted text files (absolute, because the
    # path is handed to Word, whose working directory is not ours).
    New_dir = os.path.abspath(os.path.join(path, 'txt'))
    if not os.path.exists(New_dir):
        os.mkdir(New_dir)
    if debug:
        print(New_dir)
    # Text file that lists every converted document.
    fileNameSet = os.path.abspath(os.path.join(New_dir, 'fileNames.txt'))
    wordapp = None
    try:
        with open(fileNameSet, "w") as o:
            for filename in files:
                if debug:
                    print(filename)
                # Skip anything that is not a Word document.
                is_doc = fnmatch.fnmatch(filename, '*.doc')
                is_docx = fnmatch.fnmatch(filename, '*.docx')
                if not is_doc and not is_docx:
                    continue
                # Skip the temporary lock files Word creates while editing.
                if fnmatch.fnmatch(filename, '~$*'):
                    continue
                if debug:
                    print(filename)
                docpath = os.path.abspath(os.path.join(path, filename))
                # Same base name, extension replaced by .txt.
                new_txt_name = os.path.splitext(filename)[0] + '.txt'
                if debug:
                    print(new_txt_name)
                word_to_txt = os.path.join(New_dir, new_txt_name)
                print(word_to_txt)
                if wordapp is None:
                    # Started lazily so that folders without Word files do
                    # not need a Word installation.
                    wordapp = wc.Dispatch('Word.Application')
                doc = wordapp.Documents.Open(docpath)
                try:
                    # File format 4 is used so that the result can later be
                    # opened in Python text mode without garbled characters.
                    doc.SaveAs(word_to_txt, 4)
                finally:
                    doc.Close()
                o.write(word_to_txt + '\n')
                all_FileNum += 1
    finally:
        if wordapp is not None:
            wordapp.Quit()
if __name__ == '__main__':
    # Script entry point: print a short usage banner, convert the Word files
    # found in the hard-coded folder and report how many were converted.
    print('''将一个目录下所有doc和docx文件转成txt
该目下创建一个新目录newdir
新目录下fileNames.txt创建一个文本存入所有的word文件名
本程序具有一定的容错性''')
    print('Enter your Director\'s path:')
    # Raw strings keep the backslashes literal; the printed text and the
    # path value are unchanged, but no invalid escape sequences remain.
    print(r"路径用\或\表示均可")
    # The folder is hard-coded; nothing is read from the console.
    mypath = r'G:\code_py\rain\doc'
    print ('生成的文件有:')
    Translate(mypath)
    print('The Total Files Numbers = ', all_FileNum)
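原始文件所在路径为mypath,这段代码来自网上(目前我还不会写这样的代码呀!)。注意:转换后的txt保存在mypath下的txt子目录中,而下一步的代码是从G:\code_py\rain\newdir读取数据的,因此需要先把生成的txt文件移到该目录(或修改代码中的路径);同时生成的fileNames.txt不是数据文件,不要一并放入。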
2、对txt文件进行处理
主要用了pandas,因为前两周都用过哈。
#-*- coding=utf-8 -*-
import pandas as pd
import numpy as np
import glob
import os
#获取省份
def getProvince():
    """Return the sorted unique values found in the lookup table ``dic.txt``.

    The table is read from ``G:\\code_py\\rain`` (the working directory is
    changed to that folder) as whitespace-separated text with a header row;
    the first column becomes the index and every remaining value takes part
    in the ``np.unique`` call.
    Presumably the remaining column holds province names - confirm against
    the actual file.

    Returns:
        numpy.ndarray: the distinct non-index values of the table.
    """
    os.chdir(r'G:\\code_py\\rain')
    # The context manager closes the file as soon as it has been parsed.
    with open('dic.txt', encoding='utf-8') as f:
        dicdata = pd.read_table(f, header=0, sep=r'\s+', index_col=0)
    provinces = np.unique(dicdata.values)
    return provinces
#通过省份获取站点
def statNum(province):
    """Look up the station number(s) registered for *province*.

    ``dic.txt`` is read from ``G:\\code_py\\rain`` (the working directory is
    changed to that folder) with its second column as index, and the rows
    labelled *province* are selected.

    Args:
        province: index label to look up (second column of ``dic.txt``).

    Returns:
        The first remaining column of the selected rows: a one-dimensional
        array when several rows match, or a single scalar (``shape == ()``)
        when exactly one row matches.

    Raises:
        KeyError: if *province* does not occur in the table.
    """
    os.chdir(r'G:\\code_py\\rain')
    # The context manager closes the file as soon as it has been parsed.
    with open('dic.txt', encoding='utf-8') as f:
        dicdata = pd.read_table(f, header=0, sep=r'\s+', index_col=1)
    statnum = dicdata.loc[province].values
    return statnum.T[0]
#通过站点获取省份
def numProvince(num):
    """Look up the province registered for station number *num*.

    ``dic.txt`` is read from ``G:\\code_py\\rain`` (the working directory is
    changed to that folder) with its first column as index.

    Args:
        num: station number, i.e. a label of the first column of ``dic.txt``.

    Returns:
        The first value of the row(s) labelled *num*.

    Raises:
        KeyError: if *num* does not occur in the table.
    """
    os.chdir(r'G:\\code_py\\rain')
    # The context manager closes the file as soon as it has been parsed.
    with open('dic.txt', encoding='utf-8') as f:
        dicdata = pd.read_table(f, header=0, sep=r'\s+', index_col=0)
    province = dicdata.loc[num].values
    return province[0]
#格式化指定站点号,月份的数据
def csvData(i,idloc):
    """Load the rows of one station from the *i*-th data file.

    The ``*.txt`` files of ``G:\\code_py\\rain\\newdir`` are listed with
    ``glob`` and the one at position *i* is parsed as header-less,
    whitespace-separated text.  Columns 0, 4, 5, 6, 7 and 8 are kept; the
    first four of them form the index and the rows whose first index level
    equals *idloc* are selected.

    Args:
        i: position of the data file in the ``glob`` listing.
        idloc: station number to select.

    Returns:
        pandas.DataFrame indexed by (省份, 年, 月, 日) with the columns
        '20-8时降水量' and '8-20时降水量'.

    Raises:
        IndexError: if *i* is outside the file listing.
        KeyError: if the station does not occur in the file.
    """
    os.chdir(r'G:\\code_py\\rain\\newdir')
    lst = glob.glob("*.txt")
    # The context manager closes the file as soon as it has been parsed.
    with open(lst[i], encoding='utf-8') as f:
        csvdata = pd.read_table(f, header=None, sep=r'\s+',
                                usecols=[0, 4, 5, 6, 7, 8],
                                index_col=[0, 1, 2, 3])
    # NOTE(review): 32700 is presumably a special code in the raw data that
    # should count as zero precipitation - confirm against the data format.
    data = csvdata.loc[idloc].replace(32700, 0)
    colname = ['20-8时降水量', '8-20时降水量']
    myindex = ['年', '月', '日']
    province = numProvince(idloc)
    # Turn the date index levels back into ordinary columns ...
    b = data.reset_index(drop=False)
    # ... and rebuild the frame with the province as (repeated) row label.
    c = pd.DataFrame(b.values, index=[province] * len(b.index),
                     columns=(myindex + colname))
    c.index.names = ['省份']
    d = c.reset_index(drop=False)
    d = d.set_index(['省份', '年', '月', '日'])
    return d
#创建指定省份月份的空DataFrame
def voidDf(i,province):
    """Build an all-zero frame shaped like one station's data for file *i*.

    The first station returned by ``statNum(province)`` (or the only one,
    when a scalar comes back) is loaded with ``csvData`` and serves purely as
    a template: its index is reused, the two precipitation columns are
    filled with zeros.
    """
    stations = statNum(province)
    # A scalar result (shape == ()) means the province has a single station.
    template_station = stations if stations.shape == () else stations[0]
    template = csvData(i, template_station)
    zeros = np.zeros([len(template), 2])
    return pd.DataFrame(zeros, index=template.index,
                        columns=['20-8时降水量', '8-20时降水量'])
def saveTxt(i,province):
    """Append the province total of data file *i* to ``<province>.txt``.

    The precipitation columns of every station of *province* are added up
    and the result is appended, tab-separated and without a header row, to
    ``G:\\code_py\\rain\\result\\<province>.txt``.

    Args:
        i: position of the data file in the ``glob`` listing of
            ``G:\\code_py\\rain\\newdir``.
        province: province label as it appears in ``dic.txt``.
    """
    os.chdir(r'G:\\code_py\\rain\\newdir')
    lst = glob.glob("*.txt")
    statnum = statNum(province)
    if statnum.shape == ():
        # Province with a single station: nothing to add up.
        # NOTE(review): this branch always reads the first data file
        # (index 0) regardless of i - confirm that this is intended.
        data = csvData(0, statnum).reset_index(drop=False)
        os.chdir(r'G:\\code_py\\rain\\result')
        # The file is named after the province instead of a fixed name, so
        # that any single-station province ends up in its own file.
        data.to_csv(str(province) + '.txt', mode='a', sep='\t', index=False,
                    header=None, encoding='utf-8')
        return
    # Start from zeros and accumulate station by station.
    data1 = voidDf(i, province)
    for idloc in statnum:
        try:
            data1 = data1 + csvData(i, idloc)
        except Exception as exc:
            # Best effort: a station that cannot be loaded from this file is
            # left out of the total, but the omission is reported instead of
            # being silently ignored.
            print('skipped station %s in %s: %r' % (idloc, lst[i], exc))
    print(lst[i])
    os.chdir(r'G:\\code_py\\rain\\result')
    # Move the index levels back into columns so that they are written out.
    data = data1.reset_index(drop=False)
    data.to_csv(str(province) + '.txt', mode='a', sep='\t', index=False,
                header=None, encoding='utf-8')
#特殊站点
def shanghai(i,idloc):
    """Overwrite ``上海.txt`` in the result folder with one station's data.

    The rows of station *idloc* are always taken from the first data file
    (index 0); the argument *i* is accepted but not used.  The file is
    written tab-separated, without header, replacing any previous content.
    """
    station_rows = csvData(0, idloc)
    flat = station_rows.reset_index(drop=False)
    os.chdir(r'G:\\code_py\\rain\\result')
    flat.to_csv('上海' + '.txt', mode='w', sep='\t', index=False,
                header=None, encoding='utf-8')
#添加表头,以及将数据排序
def resultData(resultlst):
    """Add a header to one result file and sort its rows.

    The file *resultlst* is read from ``G:\\code_py\\rain\\result`` as
    header-less, whitespace-separated text, its six columns are named, the
    rows are sorted by (省份, 年, 月, 日) and the table is printed and written
    under the same file name, tab-separated and with header, to
    ``G:\\code_py\\rain\\result\\result`` (created when missing).

    Args:
        resultlst: name of the file inside the result folder.
    """
    os.chdir(r'G:\\code_py\\rain\\result')
    colname = ['20-8时降水量', '8-20时降水量']
    myindex = ['省份', '年', '月', '日']
    # The context manager closes the file as soon as it has been parsed.
    with open(resultlst, encoding='utf-8') as f:
        resultdata = pd.read_table(f, header=None, sep=r'\s+')
    resultdata.columns = myindex + colname
    resultdata = resultdata.set_index(['省份', '年', '月', '日'])
    resultdata.sort_index(inplace=True)  # sort by the index levels
    print(resultdata)
    # Make sure the output folder exists before switching into it.
    os.makedirs(r'G:\\code_py\\rain\\result\\result', exist_ok=True)
    os.chdir(r'G:\\code_py\\rain\\result\\result')
    resultdata.to_csv(resultlst, mode='w', sep='\t', index=True,
                      encoding='utf-8')
#各省份日降水量
# Step 1: append the daily precipitation of every province, one data file
# at a time, to the per-province result files.
provinces = getProvince()
os.chdir(r'G:\\code_py\\rain\\newdir')
lst = glob.glob("*.txt")
file_count = len(lst)
for province in provinces:
    print(province)
    i = 0
    while i < file_count:
        saveTxt(i, province)
        i += 1

# Step 2: rewrite the Shanghai file (everything above was appended; Shanghai
# has a single station with a single month of data).
idloc = statNum('上海')
for i in range(file_count):
    print(i)
    shanghai(i, idloc)

# Step 3: add a header to every generated file and sort its rows.
os.chdir(r'G:\\code_py\\rain\\result')
resultlsts = glob.glob("*.txt")
for resultlst in resultlsts:
    resultData(resultlst)
各函数的功能见代码中的注释。
网友评论