美文网首页Python练习
Python 文件内容读取

Python 文件内容读取

作者: 过桥 | 来源:发表于2019-03-12 15:10 被阅读0次

    背景

    计划实现文件中心,支撑检索常见文件内容

    依赖包

    #pip install baidu-aip
    from aip import AipOcr
    #pip install xlrd
    import xlrd
    import os
    #pip install csv23
    import csv23
    #pip install docx2txt
    import docx2txt
    #pip install pypiwin32
    from win32com import client as wc
    #pip install python-pptx
    from pptx import Presentation
    #pip install wand
    from wand.image import Image
    # 使用 wand 异常,缺少 ImageMagick 支持
    # http://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-windows
    # https://imagemagick.org/script/download.php#windows
    # 使用 wand 异常,FailedToExecuteCommand "gswin32c.exe":缺少 Ghostscript 支持
    # http://ghostscript.com/download/gsdnld.html
    

    基础文件读取

    .txt

    # Read a .txt file and return its content.
    def readTxt(fileUrl):
        """Return the first tab-separated field of every line, concatenated.

        Trailing whitespace is stripped from each line, only the text before
        the first tab is kept, and all spaces are removed from it.
        Returns an empty string when ``fileUrl`` does not exist.
        """
        if not os.path.exists(fileUrl):
            return ""
        pieces = []
        with open(fileUrl, 'r') as handle:
            for line in handle:
                first_field = line.rstrip().partition('\t')[0]
                pieces.append(first_field.replace(' ', ''))
        return "".join(pieces)
    

    .xls .xlsx

    # Read an Excel workbook (.xls / .xlsx) and return its content.
    def readExcel(fileUrl):
        """Return every cell of every sheet as one string with spaces removed.

        Sheets are visited in workbook order and rows top to bottom; each cell
        value is converted with ``str`` before being concatenated.
        Returns an empty string when ``fileUrl`` does not exist.
        """
        if not os.path.exists(fileUrl):
            return ""
        workbook = xlrd.open_workbook(fileUrl)
        chunks = []
        for sheet_name in workbook.sheet_names():
            sheet = workbook.sheet_by_name(sheet_name)
            for row_index in range(sheet.nrows):
                cells = [str(value) for value in sheet.row_values(row_index)]
                chunks.append(''.join(cells).replace(' ', ''))
        return ''.join(chunks)
    

    .docx

    # Read a .docx file and return its content.
    def readDocx(fileUrl):
        """Return the text extracted by docx2txt with all whitespace removed.

        Returns an empty string when ``fileUrl`` does not exist.
        """
        if not os.path.exists(fileUrl):
            return ""
        extracted = docx2txt.process(fileUrl)
        # split() with no arguments drops every run of whitespace.
        return "".join(extracted.split())
    

    .doc

    # Read a .doc file. Requires pypiwin32 and a locally installed Word:
    # the document is converted to .docx by Word and then read with readDocx.
    def readDoc(fileUrl):
        """Return the text of a legacy .doc file with all whitespace removed.

        The file is opened through the Word COM application, saved as a
        temporary ``<fileUrl>.docx`` next to the original, read with
        ``readDocx`` and the temporary file is removed again.

        Word is always closed and the temporary file is always removed, even
        when conversion or reading fails; the underlying exception propagates.
        """
        source_path = os.path.abspath(fileUrl)
        # Use one absolute path for saving, reading and deleting so all three
        # steps are guaranteed to refer to the same file.
        temp_path = source_path + ".docx"

        word = wc.Dispatch('Word.Application')
        try:
            doc = word.Documents.Open(source_path)
            try:
                # File format 12 is Word's XML document format (.docx).
                doc.SaveAs(temp_path, 12, False, "", True, "", False, False, False, False)
            finally:
                doc.Close()
        finally:
            # Never leave a Word process behind if something went wrong.
            word.Quit()

        try:
            return readDocx(temp_path)
        finally:
            # Remove the temporary converted file.
            if os.path.exists(temp_path):
                os.remove(temp_path)
    
    

    其他文件读取

    .csv

    # Read a .csv file and return its content. The file is decoded with the
    # csv23 default encoding first; if that fails it is decoded as GBK.
    def readCsv(fileUrl):
        """Return all CSV fields concatenated into one string, spaces removed.

        The file is first read with csv23's default encoding. If decoding
        fails, the partial result is discarded and the whole file is read
        again as GBK, so no row can appear twice in the output.
        Returns an empty string when ``fileUrl`` does not exist.
        """
        if not os.path.exists(fileUrl):
            return ""

        def read_all(**open_kwargs):
            # Each attempt builds its own result so a failure half-way through
            # the file cannot leak partially decoded rows into the next attempt.
            with csv23.open_csv(fileUrl, **open_kwargs) as reader:
                return ''.join(''.join(row).replace(' ', '') for row in reader)

        try:
            return read_all()
        except UnicodeError:
            # Not decodable with the default encoding: retry as GBK.
            return read_all(encoding='gbk')
    

    图片

    # Read an image file and return the text recognised in it (Baidu OCR).
    def readImage(fileUrl):
        """Return the text recognised in an image, with spaces removed.

        The image bytes are sent to Baidu's general OCR endpoint through
        ``AipOcr.basicGeneral`` and the ``words`` of every entry in
        ``words_result`` are concatenated.
        Returns an empty string when ``fileUrl`` does not exist.

        Raises:
            RuntimeError: if the OCR response carries no ``words_result``
                (the raw response is included in the message).
        """
        if not os.path.exists(fileUrl):
            return ""

        # Placeholder credentials: replace with the values of a real Baidu AIP
        # application before use.
        APP_ID = 'xxxxx'
        API_KEY = 'xxxxxxxxxxxxxxxx'
        SECRET_KEY = 'xxxxxxxxxxxxxxxxxxxxxxx'
        client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

        # Read the bytes first so the file is not held open during the request.
        with open(fileUrl, 'rb') as f:
            img = f.read()

        msg = client.basicGeneral(img)
        words_result = msg.get('words_result')
        if words_result is None:
            # Without this check the loop below would fail with an opaque
            # "'NoneType' object is not iterable".
            raise RuntimeError(
                "OCR response for %s has no 'words_result': %r" % (fileUrl, msg)
            )
        return ''.join(
            (item.get('words') or '').replace(' ', '') for item in words_result
        )
    

    .pptx

    # Read a .pptx file. By default only text frames are read; tables and
    # images are skipped unless explicitly enabled.
    def readPptx(fileUrl, extend_table=False, extend_image=False):
        """Return the text of a .pptx presentation with all whitespace removed.

        Args:
            fileUrl: path of the presentation to read.
            extend_table: also include the text of table cells.
            extend_image: also run OCR (``readImage``) on embedded images.

        Shapes that have a text frame contribute their text only. For other
        shapes, images and tables are processed when the matching flag is set.
        Images are written to a private temporary file for OCR, which is
        removed afterwards even when OCR fails.
        """
        import tempfile  # local import keeps this block self-contained

        parts = []
        ppt = Presentation(fileUrl)

        for slide in ppt.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    parts.append(shape.text)
                    continue

                # Extract text from the image.
                if extend_image and hasattr(shape, 'image'):
                    image = shape.image
                    # Keep the image's extension but let tempfile pick a unique
                    # name, so no existing file in the working directory can be
                    # overwritten and then deleted.
                    suffix = os.path.splitext(image.filename or '')[1]
                    fd, temp_path = tempfile.mkstemp(suffix=suffix)
                    try:
                        with os.fdopen(fd, 'wb') as f:
                            f.write(image.blob)
                        # Run OCR on the stored image.
                        parts.append(readImage(temp_path))
                    finally:
                        # Remove the temporary image.
                        os.remove(temp_path)

                # Extract the table content.
                if extend_table and shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            parts.append(cell.text)

        # split() with no arguments drops every run of whitespace.
        return "".join("".join(parts).split())
    
    

    .ppt

    # Read a .ppt file. Requires pypiwin32 and a locally installed PowerPoint:
    # the file is converted to .pptx by PowerPoint and then read with readPptx.
    def readPpt(fileUrl, extend_table=False, extend_image=False):
        """Return the text of a legacy .ppt file with all whitespace removed.

        Args:
            fileUrl: path of the presentation to read.
            extend_table: forwarded to ``readPptx``.
            extend_image: forwarded to ``readPptx``.

        The file is opened through the PowerPoint COM application, saved as a
        temporary ``<fileUrl>.pptx`` next to the original, read with
        ``readPptx`` and the temporary file is removed again. PowerPoint is
        always closed and the temporary file is always removed, even when
        conversion or reading fails; the underlying exception propagates.
        """
        source_path = os.path.abspath(fileUrl)
        # Use one absolute path for saving, reading and deleting so all three
        # steps are guaranteed to refer to the same file.
        temp_path = source_path + ".pptx"

        powerpoint = wc.Dispatch('PowerPoint.Application')
        try:
            ppt = powerpoint.Presentations.Open(source_path)
            try:
                # Save the temporary converted file.
                ppt.SaveAs(temp_path)
            finally:
                ppt.Close()
        finally:
            # Never leave a PowerPoint process behind if something went wrong.
            powerpoint.Quit()

        try:
            return readPptx(temp_path, extend_table, extend_image)
        finally:
            # Remove the temporary converted file.
            if os.path.exists(temp_path):
                os.remove(temp_path)
    
    

    .pdf

    # Read a .pdf file by rendering every page to JPEG and running OCR on it.
    def readPdf(fileUrl):
        """Return the text recognised on all pages of a PDF, in page order.

        The PDF is rendered with wand at 300 dpi and converted to JPEG; each
        page is written to a private temporary file and passed to
        ``readImage``. All wand images are released and every temporary file
        is removed, even when rendering or OCR fails.
        """
        import tempfile  # local import keeps this block self-contained

        texts = []
        # wand images wrap native resources; the with-blocks release them.
        with Image(filename=fileUrl, resolution=300) as image_pdf:
            with image_pdf.convert('jpg') as image_jpeg:
                # Each entry of `sequence` is one page of the PDF.
                for page in image_jpeg.sequence:
                    with Image(image=page) as img_page:
                        blob = img_page.make_blob('jpg')

                    # A unique temp file avoids overwriting (and then deleting)
                    # an existing "<pdf>.jpg" next to the source document.
                    fd, temp_path = tempfile.mkstemp(suffix='.jpg')
                    try:
                        with os.fdopen(fd, 'wb') as ff:
                            ff.write(blob)
                        # Run OCR on the rendered page.
                        texts.append(readImage(temp_path))
                    finally:
                        # Remove the temporary image.
                        os.remove(temp_path)
        return "".join(texts)
    
    

    相关文章

      网友评论

        本文标题:Python 文件内容读取

        本文链接:https://www.haomeiwen.com/subject/bfqnpqtx.html