美文网首页大数据 爬虫Python AI SqlPython小哥哥
Python读取word文档识别字段颜色,解析字段!

Python读取word文档识别字段颜色,解析字段!

作者: 14e61d025165 | 来源:发表于2019-07-10 15:20 被阅读2次

python版本3.7.3,读取的文档格式为.docx

文中带有简单注释

如果看不懂,可以从百度网盘下载源码直接查看,修改并运行其中的 py 文件。

网盘下载 Python学习交流群:1004391443
提取码:nngw
import os
import sys
import xlrd
import codecs
import collections
import json
import io
import docx
import string
from docx import Document
from docx.shared import RGBColor #这个是docx的颜色类
# Length (in characters) of the longest question line seen so far; updated by readDocx.
maxLength = 0
# Id assigned to the next completed question; writeDocx resets it to 1 for every output file.
id = 1
# Question records collected for the output file that is currently being built.
convert_list = []
# Reset by writeDocx; nothing in this file appends to it at the moment.
type_list = []
# Directory that contains this script. Input .docx files and the "topic" output
# folder are resolved relative to it. (`__file__`, not `file`: the latter is an
# undefined name on Python 3.)
curPath = os.path.dirname(os.path.abspath(__file__))

# coding=utf-8

# 获取文档对象

# Annotations cut from option text; each one found truncates the text at its position.
_ANSWER_MARKERS = ("(Direito)", "(Correcta)", "(Right)", "(Correct)")
# str() of the run font colour that flags an option as the right answer.
_RIGHT_ANSWER_RGB = "00FF00"
# A question record is one question line followed by four option lines.
_LINES_PER_QUESTION = 5


def _stripNumberPrefix(text):
    """Drop a "Número" label found at offset 0 or 1, plus the one character after it."""
    pos = text.find("Número")
    if pos != -1 and pos < 2:
        return text[pos + len("Número") + 1:]
    return text


def _extractQuestion(text):
    """Return the text after the first "-", "." or " " separator, cut at any "|"."""
    # Separators are tried in priority order: "-" wins over "." which wins over " ".
    for separator in ("-", ".", " "):
        pos = text.find(separator)
        if pos != -1:
            text = text[pos + len(separator):]
            break
    # When no separator exists at all the whole line is kept as the question
    # (the previous str.index() call raised ValueError here).
    return text.split("|")[0]


def _stripAnswerMarkers(text):
    """Truncate *text* at each answer marker that occurs in it."""
    for marker in _ANSWER_MARKERS:
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def readDocx(fileName, type):
    """Parse one .docx question file and append its questions to ``convert_list``.

    The first paragraph and all blank paragraphs are skipped. The remaining
    paragraphs are consumed in groups of five: the first is the question, the
    next four are options 1-4. An option containing a run whose font colour is
    ``00FF00`` is recorded as ``rightIndex``. A trailing group with fewer than
    five paragraphs is discarded.

    Args:
        fileName: Path of the document relative to ``curPath``, without the
            ``.docx`` extension. Backslashes are accepted as separators.
        type: Value stored under ``"subType"`` in every record from this file.

    Side effects:
        Appends dicts to the module-level ``convert_list``, advances the
        module-level ``id`` counter and raises ``maxLength`` when a longer
        question line is seen.
    """
    global id
    global maxLength

    # Callers pass Windows-style relative paths; map "\" to the local separator
    # so the same call also resolves on other platforms.
    relativePath = fileName.replace("\\", os.sep) + ".docx"
    xlsFile = os.path.join(curPath, relativePath)
    print("xlsFile: " + xlsFile)
    document = docx.Document(xlsFile)

    index = 0  # position inside the current five-line group (0 = question)
    data = {}
    for lineNo, paragraph in enumerate(document.paragraphs, start=1):
        if lineNo <= 1:  # skip the first line of the document
            continue
        text = paragraph.text
        if not text.strip():
            continue

        if index == 0:
            # Question line. Work on a local copy instead of assigning to
            # paragraph.text, which would rewrite the paragraph's runs.
            text = _stripNumberPrefix(text)
            maxLength = max(maxLength, len(text))
            data["question"] = _extractQuestion(text)
            data["subType"] = type
        else:
            # Option line: the right answer is identified by its run colour.
            for run in paragraph.runs:
                if str(run.font.color.rgb) == _RIGHT_ANSWER_RGB:
                    data["rightIndex"] = index
            data["option" + str(index)] = _stripAnswerMarkers(text)

        index += 1
        if index >= _LINES_PER_QUESTION:
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id += 1
            data = {}

def writeDocx(fileList, name):
    """Parse every document in *fileList* and write the questions to one JSON file.

    Args:
        fileList: Iterable of dicts with a ``"path"`` key (document path passed
            to ``readDocx``) and a ``"type"`` key (sub-type for its questions).
        name: Base name of the output file, written as
            ``<curPath>/topic/<name>.txt``.

    Side effects:
        Resets the module-level ``id``, ``convert_list`` and ``type_list``
        before parsing, so ids restart at 1 for each output file.
    """
    global id
    global convert_list
    global type_list
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    # Question bank output path.
    jsonPath = os.path.join(curPath, "topic", name + ".txt")
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)
    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the non-ASCII question text readable in the file.
        f.write(json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True))

def main():
    """Convert the Portuguese, Spanish and English question banks, in that order.

    Each language has six source documents (one per category); the paths are
    relative to the script directory and omit the ``.docx`` extension. Raw
    string literals keep the backslashes literal: sequences such as ``"\\地"``
    in a normal string are invalid escapes and trigger a warning on current
    Python versions.
    """
    en_fileList = [
        {"path": r"en_us_topic\地理(英)Respueda G .es.en", "type": "World"},
        {"path": r"en_us_topic\科学与技术(英)", "type": "Technology"},
        {"path": r"en_us_topic\历史(英)Resupeda H.es.en", "type": "History"},
        {"path": r"en_us_topic\艺术和文学(英)Respueda A&L.es.en", "type": "ArtAndLiterature"},
        {"path": r"en_us_topic\娱乐(英)Respueda E.es.en", "type": "Fashion"},
        {"path": r"en_us_topic\运动(英)Respueda D.es.en", "type": "Sports"},
    ]
    en_name = "en_us_topic"

    es_fileList = [
        {"path": r"es_es_topic\地理(西)Respueda G ", "type": "World"},
        {"path": r"es_es_topic\科学与技术(西)Respueda C&T", "type": "Technology"},
        {"path": r"es_es_topic\历史(西)Resupeda H", "type": "History"},
        {"path": r"es_es_topic\艺术和文学(西)Respueda A&L", "type": "ArtAndLiterature"},
        {"path": r"es_es_topic\娱乐(西)Respueda E", "type": "Fashion"},
        {"path": r"es_es_topic\运动(西)Respueda D", "type": "Sports"},
    ]
    es_name = "es_es_topic"

    pt_fileList = [
        {"path": r"pt_br_topic\地理(葡)Respueda G .es.pt", "type": "World"},
        {"path": r"pt_br_topic\科学与技术(葡)", "type": "Technology"},
        {"path": r"pt_br_topic\历史(葡)Resupeda H.es.pt", "type": "History"},
        {"path": r"pt_br_topic\艺术和文学(葡)Respueda A&L.es.pt", "type": "ArtAndLiterature"},
        {"path": r"pt_br_topic\娱乐(葡)Respueda E.es.pt", "type": "Fashion"},
        {"path": r"pt_br_topic\运动(葡)Respueda D.es.pt", "type": "Sports"},
    ]
    pt_name = "pt_br_topic"

    writeDocx(pt_fileList, pt_name)
    writeDocx(es_fileList, es_name)
    writeDocx(en_fileList, en_name)

# Runs the whole conversion whenever this file is executed or imported
# (the call is not wrapped in an `if __name__ == "__main__":` guard).
main()

相关文章

网友评论

    本文标题:Python读取word文档识别字段颜色,解析字段!

    本文链接:https://www.haomeiwen.com/subject/qiehkctx.html