组件安装
docx==0.2.4
python-docx==0.8.1
python-pptx==0.6.21
xlrd==2.0.1
openpyxl==3.0.9
# -*- coding: utf-8 -*-
# @Date : 2022/3/31 15:42
# version: Python 3.8.*
# @File : FileRead.py
import docx
import subprocess
from pptx import Presentation
from xlrd import open_workbook
import openpyxl
def ReadDocx(filepath):
    """Return the text of a .docx file as a single string.

    The text of every entry in ``doc.paragraphs`` is concatenated in order,
    with no separator inserted between paragraphs.

    Args:
        filepath: Location of the document, passed to ``docx.Document``.

    Returns:
        The concatenated paragraph text; ``''`` when there are no paragraphs.
    """
    doc = docx.Document(filepath)
    # Join once instead of growing a string with ``+`` on every iteration.
    return ''.join(para.text for para in doc.paragraphs)
def ReadDoc(filepath):
    """Return the text of a legacy .doc file by running ``antiword`` on it.

    Args:
        filepath: Location of the document, passed as the only argument to
            the ``antiword`` executable.

    Returns:
        The captured standard output of ``antiword`` decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: If ``antiword`` exits with a non-zero
            status.
    """
    command = ['antiword', filepath]
    raw_output = subprocess.check_output(command)
    return str(raw_output, "utf-8")
def ReadText(filepath):
    """Read a UTF-8 encoded text file and return its entire content.

    Args:
        filepath: Location of the file to read.

    Returns:
        The whole file content as one string.
    """
    # The context manager guarantees the handle is closed, even when
    # ``read()`` raises (for example on invalid UTF-8).
    with open(filepath, mode='r', encoding="utf-8") as f:
        return f.read()
def ReadText2(filepath):
    """Read a UTF-8 encoded text file and return it as a list of lines.

    Args:
        filepath: Location of the file to read.

    Returns:
        A list with one string per line, each keeping its line ending;
        an empty list for an empty file.
    """
    with open(filepath, mode='r', encoding="utf-8") as handle:
        # Iterating a text file yields the same items as ``readlines()``.
        lines = list(handle)
    return lines
def ReadPptx(filepath):
    """Return the text of every text-bearing shape in a .pptx file.

    Slides are visited in order, and within each slide every shape whose
    ``has_text_frame`` is true contributes its ``text_frame.text``. The
    pieces are concatenated with no separator.

    Args:
        filepath: Location of the presentation, passed to ``Presentation``.

    Returns:
        The concatenated text; ``''`` when no shape has a text frame.
    """
    ppt = Presentation(filepath)
    parts = []
    for slide in ppt.slides:
        for shape in slide.shapes:
            # Shapes without a text frame carry no text to collect.
            if shape.has_text_frame:
                parts.append(shape.text_frame.text)
    # Join once instead of re-building the string for every shape.
    return ''.join(parts)
def ReadXlsFile(filepath):
    """Return the sheet names and cell contents of an .xls workbook as text.

    For each sheet, the sheet name is emitted first, followed by every cell
    value in row-major order. Nothing is inserted between the pieces.

    Args:
        filepath: Location of the workbook, passed to ``open_workbook``.

    Returns:
        The concatenated text of all sheets.
    """
    # open_workbook opens the path itself, so no separate file handle is
    # needed here.
    wb = open_workbook(filepath)
    parts = []
    for s in wb.sheets():
        parts.append(s.name)
        for row in range(s.nrows):
            for col in range(s.ncols):
                # Cell values are not always strings (numeric cells come
                # back as numbers), so convert before joining.
                parts.append(str(s.cell(row, col).value))
    return ''.join(parts)
def ReadXlsxFile(filepath):
    """Return the sheet titles and cell contents of an .xlsx workbook as text.

    For each sheet, the title is emitted first, followed by every cell in
    the rectangle ``1..max_row`` by ``1..max_column`` in row-major order.
    Each cell value is converted with ``str`` and left-justified: 17
    characters wide on the first row, 20 on every later row.

    Args:
        filepath: Location of the workbook, passed to
            ``openpyxl.load_workbook``.

    Returns:
        The concatenated text of all sheets.
    """
    wb = openpyxl.load_workbook(filepath)
    parts = []
    for name in wb.sheetnames:
        sheet = wb[name]
        parts.append(sheet.title)
        # Evaluate the column span once per sheet rather than once per row.
        columns = range(1, sheet.max_column + 1)
        for r in range(1, sheet.max_row + 1):
            # The first row uses a narrower pad width than the others.
            width = 17 if r == 1 else 20
            # NOTE(review): a cell whose value is None is rendered as the
            # literal text "None" by str(); kept for output compatibility.
            parts.extend(
                str(sheet.cell(row=r, column=c).value).ljust(width)
                for c in columns
            )
    return ''.join(parts)
网友评论