Python 批量筛选 Word 数据并导出生成 Excel

作者: 幸宇 | 来源:发表于2021-04-16 22:12 被阅读0次

python 批处理筛选world数据导入生成Excel
基于C++的灵活操作Excel导入导出数据
Excel的筛选你会，在Word内筛选数据你会吗？
HR必会的40个Excel逆天功能，动画教程珍藏版！
Excel简单动画
openpyxl各种操作汇总（1）—— 打开关闭工作簿、工作表
openpyxl操作Excel
Heidisql 导入 excel 数据
js-xlsx + handsontable + echarts
js文件上传、导入报表excel

最近公司有个很烦人的数据处理需求：需要手动把 Word 里的数据复制出来生成 Excel 表格，因为我们的平台系统只能导入 Excel 格式的数据，而学校给的 Word 数据是粘贴复制出来的表格形式。一开始就想到了用 pandas，但遇到以下两个问题：
1、用 pandas 处理时，像学生编号这类数据转换后会变成 NaN，数据得不到很好的转换。
2、其次，学校给的数据解析出来有太多重复，即使做了去重，得到的数据也无法保持 Word 里原有的顺序。
所以最后没有用 pandas 筛选，而是选择了简单的循环遍历，用到的库有 python-docx（导入名为 docx）、xlwt，以及用于读取文件的 os；
需求给的数据截图如下：

image.png

平台需要的Excel数据如下：

image.png

那么自己是怎么实现的，先说一下思路，很简单：
1、获取 Word 表格（table）里每个单元格（cell）的内容；（读出来的数据有大量重复，不确定是老师们粘贴复制造成的，还是从其他平台复制过来造成的）
2、筛选数据，去除重复项；
3、摘取所需要的字段；
4、每个 Word 文件对应导出一个同名的 Excel 文件；

image.png

5、最后封装模块方法功能
6、执行主文件批量生成excel:

image.png

如下介绍各个模块的代码：
test2.py
封装文件路径函数返回文件夹下的所有docx文件路径和文件名

import os
# def getFilename():
#
#     arr = []
#     for root,dirs,files in os.walk('/Users/zhangxing/PycharmProjects/pythonProject/1'):
#         for file in files:
#             if os.path.splitext(file)[1] == '.docx':
#                 # print(os.path.join(root,file))
#                 arr.append((os.path.join(root,file)).split(root+'/')[1])
#
#     # print(arr)
#
#     filname = arr[0].split('.')[0]
#     return filname
#     # print(filname)
#
#
# def getFiles():
#     # print(os.walk('/Users/zhangxing/PycharmProjects/pythonProject/1'))
#     print(os.listdir('/Users/zhangxing/PycharmProjects/pythonProject/1'))
#     arr = []
#     for root,dirs,files in os.walk('/Users/zhangxing/PycharmProjects/pythonProject/1'):
#         for file in files:
#             if os.path.splitext(file)[1] == '.docx':
#                 print(os.path.join(root,file))
#                 return os.path.join(root,file)


# getFilename()
# getFiles()

# 返回文件夹下的所有docx文件路径和文件名
class DocFiles():
    """Locate the Word (.docx) documents stored directly inside one folder.

    Attributes:
        path: Folder that is scanned for documents.
        pathArr: Result of the most recent ``getPath()`` call (empty before
            the first call).
        target_path: Copy of ``path`` exposed through ``targetPath()``.
    """

    def __init__(self,path):
        self.path = path
        self.pathArr = []
        self.target_path = path

    def getPath(self):
        """Return a list with one dict per ``.docx`` file found in ``self.path``.

        Each dict has two keys: ``'path'`` (the folder joined with the file
        name) and ``'name'`` (the file name without its extension).  Entries
        are ordered by file name so repeated runs process files in the same
        order.

        Raises:
            OSError: propagated from ``os.listdir`` when the folder cannot be
                listed.
        """
        found = []
        for file_name in sorted(os.listdir(self.path)):
            # splitext only strips the final extension, so a name such as
            # "class.1.docx" keeps its inner dots in the stem.
            stem, ext = os.path.splitext(file_name)
            if ext.lower() != '.docx':
                continue  # e.g. .DS_Store, generated .xls files, sub-folders
            if file_name.startswith('~$'):
                continue  # lock file Word creates while a document is open
            found.append({'path': os.path.join(self.path, file_name), 'name': stem})

        # Rebuild the list on every call so calling getPath() twice does not
        # return each document twice.
        self.pathArr = found
        return self.pathArr

    def targetPath(self):
        """Return the folder path this instance was created with."""
        return self.target_path

main2.py 读取文件内容摘取所需要的内容并生成对应的sheet

import docx
from docx import Document
import xlwt
import os
import shutil
# from test2 import getFiles

def readFile(filpath,filename):
    """Extract student fields from a .docx file's tables into an .xls workbook.

    Every cell of every table in the document is read, repeated cell texts
    are dropped (first occurrence wins, order preserved), and the name,
    gender and student-ID values found in the remaining texts are written to
    columns 0-3 of a sheet called "sheet1" underneath a header row.

    Args:
        filpath: Path of the .docx document to read.
        filename: Output name without extension; the workbook is saved as
            ``filename + ".xls"`` (a relative name resolves against the
            current working directory).

    A document without any table produces a workbook that contains only the
    header row.
    """
    document = Document(filpath)

    # dict.fromkeys keeps the first occurrence of each text in reading order,
    # replacing the quadratic "if not in list" scan with a single pass.
    cell_texts = dict.fromkeys(
        cell.text
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )

    names, sexes, study_ids = _collect_student_fields(cell_texts)

    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet("sheet1")

    for col, title in enumerate(('序号', '姓名', '性别', '学生编号')):
        sheet.write(0, col, title)

    # The sequence number simply counts the extracted names, starting at 1.
    # NOTE(review): each column is filled independently, so a record that is
    # missing one field shifts the rows below it in that column.
    columns = (range(1, len(names) + 1), names, sexes, study_ids)
    for col, values in enumerate(columns):
        for row, value in enumerate(values, start=1):
            sheet.write(row, col, value)

    workbook.save(filename + ".xls")


def _collect_student_fields(cell_texts):
    """Return ``(names, sexes, study_ids)`` lists picked out of the cell texts."""
    names = []
    sexes = []
    study_ids = []

    for text in cell_texts:
        if '姓名：' in text:
            # NOTE(review): the slice assumes the label opens the cell and
            # keeps at most four characters after it - confirm this suits the
            # real documents (longer names would be cut).
            name = text[3:7]
            if name:
                names.append(name)

        if '学籍号：' in text:
            # The label contains the colon, so index 1 always exists here.
            study_id = text.split('：')[1]
            if study_id:
                study_ids.append(study_id)

        if '编号' in text:
            # Text between the first and second full-width colon.  A cell
            # that mentions the label without any colon yields '' and is
            # skipped instead of raising IndexError.
            value = text.partition('：')[2].split('：')[0]
            if value:
                if '女' in value:
                    sexes.append('女')
                elif '男' in value:
                    sexes.append('男')
                else:
                    sexes.append('无性别')

    return names, sexes, study_ids

product.py 批量处理文件夹下的 Word 文件，这里同样用 for...in 循环遍历

from main2 import readFile
from test2 import DocFiles
# Batch driver: turn every document listed by DocFiles into a spreadsheet.
Allfiles = DocFiles('/Users/zhangxing/PycharmProjects/pythonProject/1')

# One {'path': ..., 'name': ...} entry per file in the source folder.
listFile = Allfiles.getPath()

# Folder reported by DocFiles; not used further in this script.
targetPath = Allfiles.targetPath()

# Hand each document's path and base name to the converter.
for entry in listFile:
    readFile(filpath=entry['path'], filename=entry['name'])

git 地址：https://gitee.com/zx_top/python_word-excel
完毕