美文网首页Pythoner集中营
python读取word文档识别字段颜色,解析字段

python读取word文档识别字段颜色,解析字段

作者: Xeroo | 来源:发表于2019-07-09 12:16 被阅读3次

Python 版本为 3.7.3,读取的文档格式为 .docx。

文中带有简单注释

如果看不懂,可以从百度网盘下载后直接查看,修改并运行里面的 py 文件。

import os
import sys
import xlrd
import codecs
import collections
import json
import io
import docx
import string
from docx import Document
from docx.shared import RGBColor #这个是docx的颜色类 

# Length of the longest question line seen so far; updated by readDocx.
maxLength = 0
# Next question id to assign (note: shadows the builtin `id`).
# readDocx increments it per finished question, writeDocx resets it to 1.
id = 1
# Question records appended by readDocx and dumped to JSON by writeDocx.
convert_list = []
# Reset by writeDocx; the only code that would fill it is commented out in readDocx.
type_list = []
# Directory containing this script; input and output paths are built from it.
curPath = os.path.dirname(os.path.abspath(__file__))
# coding=utf-8
#获取文档对象
def readDocx(fileName,type):
    """Parse one quiz ``.docx`` file and append its questions to ``convert_list``.

    The first paragraph is skipped and blank paragraphs are ignored.  The
    remaining paragraphs are consumed in groups of five: one question line
    followed by four option lines.  An option is recorded as the right answer
    when any of its runs has the font colour ``00FF00``.

    Every completed group is appended to the global ``convert_list`` as a dict
    with the keys ``question``, ``subType``, ``option1`` .. ``option4``,
    ``_id`` and, when a green run was found, ``rightIndex``.  The global ``id``
    counter is advanced once per group, and the global ``maxLength`` keeps the
    length of the longest question line (measured after the optional
    "Número" prefix is removed).  Trailing paragraphs that do not fill a
    complete group of five are not emitted.

    Args:
        fileName: Document path relative to ``curPath``, without the
            ``.docx`` extension.  Backslashes are accepted as separators.
        type: Category label stored unchanged under ``subType``.
    """
    global id
    global maxLength

    # Callers pass Windows-style relative paths; map "\\" to the native
    # separator (a no-op on Windows) so the same paths resolve elsewhere too.
    relPath = fileName.replace("\\", os.sep) + ".docx"
    xlsFile = os.path.join(curPath, relPath)
    print("xlsFile: "+xlsFile)
    document = docx.Document(xlsFile)

    index = 0   # 0 = question line, 1..4 = option lines of the current group
    data = {}
    for lineNo, p in enumerate(document.paragraphs, start=1):
        if lineNo <= 1:  # skip the first paragraph
            continue
        # Work on a copy of the text: assigning to p.text would rewrite the
        # paragraph's runs, which is not needed just to trim a string.
        text = p.text
        if not text.strip():
            continue

        if index == 0:  # question line
            text = _stripNumberPrefix(text)
            if len(text) > maxLength:
                maxLength = len(text)
            # Anything after a "|" is dropped; only the first part is the question.
            data["question"] = _questionBody(text).split("|")[0]
            data["subType"] = type
        else:   # option line; a green run marks the right answer
            for run in p.runs:
                # rgb may be None (str -> "None"), which simply never matches.
                if str(run.font.color.rgb) == "00FF00":
                    data["rightIndex"] = index
            data["option"+str(index)] = _stripAnswerMarkers(text)

        index = index + 1

        if index >= 5:  # question + four options collected: emit the record
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}


def _stripNumberPrefix(text):
    """Drop a leading "Número" label (found at offset 0 or 1) plus the character after it."""
    pos = text.find("Número")
    if pos != -1 and pos < 2:
        return text[pos + len("Número") + 1:]
    return text


def _questionBody(text):
    """Return the text after the first "-", else after the first ".", else after the first " ".

    When none of the separators is present the whole text is returned; the
    previous implementation raised ValueError in that case.
    """
    for separator in ("-", ".", " "):
        pos = text.find(separator)
        if pos != -1:
            return text[pos + len(separator):]
    return text


def _stripAnswerMarkers(text):
    """Cut the option text at each "(Direito)", "(Correcta)", "(Right)" or "(Correct)" marker."""
    for marker in ("(Direito)", "(Correcta)", "(Right)", "(Correct)"):
        pos = text.find(marker)
        if pos != -1:
            text = text[0:pos]
    return text

def writeDocx(fileList,name):
    """Convert a batch of documents into a single JSON question bank.

    Resets the shared ``id`` counter and the ``convert_list`` / ``type_list``
    globals, runs ``readDocx`` on every entry of ``fileList``, then writes the
    collected records as UTF-8 JSON to ``<curPath>/topic/<name>.txt``.

    Args:
        fileList: Iterable of dicts, each with a ``"path"`` and a ``"type"`` key.
        name: Base name (without extension) of the output file.
    """
    global id, convert_list, type_list
    # Start every bank from a clean state so ids begin at 1 again.
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    # Output location of the question bank
    jsonPath = os.path.join(curPath, "topic", name + ".txt")
    targetDir = os.path.dirname(jsonPath)
    if not os.path.exists(targetDir):
        os.makedirs(targetDir)

    with io.open(jsonPath, 'w', encoding='utf-8') as out:
        out.write(
            json.dumps(
                convert_list,
                ensure_ascii=False,
                indent=4,
                sort_keys=True,
            )
        )

def main():
    """Build the Portuguese, Spanish and English question banks, in that order."""
    # Category label for each document; every bank lists its files in this order.
    categories = (
        "World",
        "Technology",
        "History",
        "ArtAndLiterature",
        "Fashion",
        "Sports",
    )

    # (output name, document paths relative to the script directory, no extension)
    banks = (
        ("pt_br_topic", (
            "pt_br_topic\\地理(葡)Respueda G .es.pt",
            "pt_br_topic\\科学与技术(葡)",
            "pt_br_topic\\历史(葡)Resupeda H.es.pt",
            "pt_br_topic\\艺术和文学(葡)Respueda A&L.es.pt",
            "pt_br_topic\\娱乐(葡)Respueda E.es.pt",
            "pt_br_topic\\运动(葡)Respueda  D.es.pt",
        )),
        ("es_es_topic", (
            "es_es_topic\\地理(西)Respueda G ",
            "es_es_topic\\科学与技术(西)Respueda C&T",
            "es_es_topic\\历史(西)Resupeda H",
            "es_es_topic\\艺术和文学(西)Respueda A&L",
            "es_es_topic\\娱乐(西)Respueda E",
            "es_es_topic\\运动(西)Respueda  D",
        )),
        ("en_us_topic", (
            "en_us_topic\\地理(英)Respueda G .es.en",
            "en_us_topic\\科学与技术(英)",
            "en_us_topic\\历史(英)Resupeda H.es.en",
            "en_us_topic\\艺术和文学(英)Respueda A&L.es.en",
            "en_us_topic\\娱乐(英)Respueda E.es.en",
            "en_us_topic\\运动(英)Respueda  D.es.en",
        )),
    )

    for bankName, paths in banks:
        fileList = [
            {"path": docPath, "type": category}
            for docPath, category in zip(paths, categories)
        ]
        writeDocx(fileList, bankName)
    
# Run the conversion only when this file is executed as a script, so that
# importing the module does not start reading and writing files.
if __name__ == "__main__":
    main()

相关文章

网友评论

    本文标题:python读取word文档识别字段颜色,解析字段

    本文链接:https://www.haomeiwen.com/subject/fbqxkctx.html