Python读取word文档识别字段颜色，解析字段！

作者: 14e61d025165 | 来源:发表于2019-07-10 15:20 被阅读2次

python读取word文档识别字段颜色，解析字段
Python读取word文档识别字段颜色，解析字段！
用python·批量生成含变量字段的excel文档
MongoDB Aggregation聚合操作之$unwind
用到过得php函数
python读文件，存excel
elasticsearch之五document详解
Java Excel (Apache POI + annotat
查询API
Opencsv将csv文件转成Bean

Python 版本为 3.7.3，读取的文档格式为 .docx。

文中带有简单注释

看不懂的话，可以从百度网盘下载后直接查看，修改并运行里面的 py 文件。

网盘下载 Python学习交流群：1004391443
提取码：nngw
import os
import sys
import xlrd
import codecs
import collections
import json
import io
import docx
import string
from docx import Document
from docx.shared import RGBColor #这个是docx的颜色类
# Module-level state shared by readDocx() and writeDocx().
maxLength = 0  # length of the longest question paragraph seen so far
# Next question id. NOTE: shadows the builtin id(); the name is kept because
# readDocx() and writeDocx() declare it with `global id`.
id = 1
convert_list = []  # parsed question records, appended to by readDocx()
type_list = []  # reset by writeDocx(); not otherwise used by the visible code
# Directory containing this script; inputs and outputs are resolved against it.
curPath = os.path.dirname(os.path.abspath(__file__))

# coding=utf-8

# Get the document object

def _strip_number_prefix(text):
    """Drop a leading "Número" label and the character after it from *text*."""
    pos = text.find("Número")
    # Only treat the label as a prefix when it starts in the first two columns.
    if pos != -1 and pos < 2:
        return text[pos + len("Número") + 1:]
    return text


def _extract_question(text):
    """Return the question body following the numbering separator in *text*."""
    # Separators are tried in priority order; the first one present wins.
    for separator in ("-", ".", " "):
        pos = text.find(separator)
        if pos != -1:
            body = text[pos + len(separator):]
            break
    else:
        # No separator at all: keep the whole paragraph instead of raising.
        body = text
    # Anything after "|" is a sub-type annotation and is not part of the question.
    return body.split("|")[0]


def _strip_answer_marker(text):
    """Cut *text* at each "correct answer" marker it contains."""
    for marker in ("(Direito)", "(Correcta)", "(Right)", "(Correct)"):
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def readDocx(fileName, type):
    """Parse one .docx question bank and append its records to ``convert_list``.

    The first paragraph and every blank paragraph are skipped. The remaining
    paragraphs are consumed in groups of five: one question followed by four
    options. An option containing a run whose font color is ``00FF00`` is
    recorded as the right answer. A trailing group with fewer than five
    paragraphs is discarded.

    Each record has the keys ``question``, ``subType``, ``option1`` ..
    ``option4``, ``_id`` and, when a green run was found, ``rightIndex``.

    Args:
        fileName: Path of the document relative to ``curPath``, without the
            ``.docx`` extension.
        type: Category stored under ``subType`` in every record. The name
            shadows the builtin but is kept for caller compatibility.

    Side effects:
        Appends to the global ``convert_list``, advances the global ``id``
        and raises the global ``maxLength`` to the longest question length.
    """
    global id
    global maxLength

    xlsFile = os.path.join(curPath, fileName + '.docx')
    print("xlsFile: " + xlsFile)
    document = docx.Document(xlsFile)

    index = 0  # position inside the current group: 0 = question, 1-4 = options
    data = {}
    for position, p in enumerate(document.paragraphs):
        if position == 0:  # skip the first line
            continue
        # Work on a local copy; assigning to p.text would rewrite the paragraph.
        text = p.text
        if not text.strip():
            continue

        if index == 0:  # question line
            text = _strip_number_prefix(text)
            if len(text) > maxLength:
                maxLength = len(text)
            data["question"] = _extract_question(text)
            data["subType"] = type
        else:  # option line, possibly the right answer
            for run in p.runs:
                # str() so that a run without an RGB color simply fails to match.
                if str(run.font.color.rgb) == "00FF00":
                    data["rightIndex"] = index
            data["option" + str(index)] = _strip_answer_marker(text)

        index += 1
        if index >= 5:
            # Group complete: store the record and start the next one.
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}

def writeDocx(fileList, name):
    """Parse every document in *fileList* and write the records as JSON.

    Args:
        fileList: Iterable of dicts with the keys ``"path"`` and ``"type"``;
            both values are passed to ``readDocx``.
        name: Base name of the output file, written to
            ``<curPath>/topic/<name>.txt`` as UTF-8.

    Side effects:
        Resets the globals ``id``, ``convert_list`` and ``type_list`` before
        parsing, so each export is numbered from 1 and holds only its own
        records.
    """
    global id
    global convert_list
    global type_list
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    # Question bank output path.
    jsonPath = os.path.join(curPath, "topic", name + ".txt")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)
    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        f.write(json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True))

def main():
    """Export the Portuguese, Spanish and English question banks, in that order.

    Each bank is a list of ``{"path", "type"}`` entries handed to
    ``writeDocx``. Paths are built with ``os.path.join`` so the literals need
    no backslash escapes and use the running platform's separator.
    """
    en_name = "en_us_topic"
    en_fileList = [
        {"path": os.path.join(en_name, "地理(英）Respueda G .es.en"), "type": "World"},
        {"path": os.path.join(en_name, "科学与技术（英）"), "type": "Technology"},
        {"path": os.path.join(en_name, "历史（英)Resupeda H.es.en"), "type": "History"},
        {"path": os.path.join(en_name, "艺术和文学（英）Respueda A&L.es.en"), "type": "ArtAndLiterature"},
        {"path": os.path.join(en_name, "娱乐（英）Respueda E.es.en"), "type": "Fashion"},
        {"path": os.path.join(en_name, "运动（英)Respueda D.es.en"), "type": "Sports"},
    ]

    es_name = "es_es_topic"
    es_fileList = [
        {"path": os.path.join(es_name, "地理(西）Respueda G "), "type": "World"},
        {"path": os.path.join(es_name, "科学与技术(西）Respueda C&T"), "type": "Technology"},
        {"path": os.path.join(es_name, "历史（西)Resupeda H"), "type": "History"},
        {"path": os.path.join(es_name, "艺术和文学（西）Respueda A&L"), "type": "ArtAndLiterature"},
        {"path": os.path.join(es_name, "娱乐（西）Respueda E"), "type": "Fashion"},
        {"path": os.path.join(es_name, "运动（西)Respueda D"), "type": "Sports"},
    ]

    pt_name = "pt_br_topic"
    pt_fileList = [
        {"path": os.path.join(pt_name, "地理(葡）Respueda G .es.pt"), "type": "World"},
        {"path": os.path.join(pt_name, "科学与技术（葡）"), "type": "Technology"},
        {"path": os.path.join(pt_name, "历史（葡)Resupeda H.es.pt"), "type": "History"},
        {"path": os.path.join(pt_name, "艺术和文学（葡）Respueda A&L.es.pt"), "type": "ArtAndLiterature"},
        {"path": os.path.join(pt_name, "娱乐（葡）Respueda E.es.pt"), "type": "Fashion"},
        {"path": os.path.join(pt_name, "运动（葡)Respueda D.es.pt"), "type": "Sports"},
    ]

    writeDocx(pt_fileList, pt_name)
    writeDocx(es_fileList, es_name)
    writeDocx(en_fileList, en_name)