美文网首页
用python合并多个pdf文件并标页码

用python合并多个pdf文件并标页码

作者: 习惯了千姿百态 | 来源:发表于2019-11-25 21:40 被阅读0次

    合并多个pdf文件

    来源某篇博客,忘了地址=_=!

    # -*- coding:utf-8*-
    # 利用PyPDF2模块合并同一文件夹下的所有PDF文件
    # 只需修改存放PDF文件的文件夹变量:file_dir 和 输出文件名变量: outfile
    
    import os
    from PyPDF2 import PdfFileReader, PdfFileWriter
    import time
    
    # Recursively collect the paths of all PDF files below a directory using os.walk.
    def getFileName(filedir):
        """Return the paths of all PDF files found under *filedir*.

        The directory tree is searched recursively. Each returned path is
        ``os.path.join(root, name)``, so it is absolute only when *filedir*
        itself is absolute.

        Args:
            filedir: Directory to search.

        Returns:
            A sorted list of paths whose file name ends in ``.pdf``
            (case-insensitive). The list is empty when nothing matches or
            when *filedir* does not exist.
        """
        file_list = [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(filedir)
            for name in files
            # Require the dot so names such as "notpdf" are rejected, and
            # ignore case so "REPORT.PDF" is accepted.
            if name.lower().endswith('.pdf')
        ]
        # os.walk yields entries in arbitrary order; sort so the merge order is
        # reproducible. The sort is lexical, so "10.pdf" comes before "2.pdf".
        file_list.sort()
        return file_list
    
    # Merge every PDF found under a directory into one output file.
    def MergePDF(filepath, outfile):
        """Merge all PDF files under *filepath* into ``filepath/outfile``.

        Progress (each source path, its page count, and the total) is printed
        to stdout. If no source PDF is found a message is printed and nothing
        is written.

        Args:
            filepath: Directory that is searched (via ``getFileName``) for
                source PDFs and that also receives the merged file.
            outfile: File name of the merged PDF, created inside *filepath*.
        """
        out_path = os.path.join(filepath, outfile)
        out_key = os.path.normcase(os.path.abspath(out_path))

        # The output lives in the folder being scanned. Skip it, otherwise a
        # re-run would merge the previous result into itself and then truncate
        # that file while its pages are still waiting to be written.
        pdf_fileName = [
            name for name in getFileName(filepath)
            if os.path.normcase(os.path.abspath(name)) != out_key
        ]

        if not pdf_fileName:
            print("没有可以合并的PDF文件!")
            return

        output = PdfFileWriter()
        outputPages = 0
        # Source streams stay open until the merged file has been written,
        # because the pages added to the writer still refer to their readers.
        handles = []
        try:
            for pdf_file in pdf_fileName:
                print("路径:%s"%pdf_file)

                # Open the source PDF.
                stream = open(pdf_file, "rb")
                handles.append(stream)
                reader = PdfFileReader(stream)

                # Count its pages.
                pageCount = reader.getNumPages()
                outputPages += pageCount
                print("页数:%d"%pageCount)

                # Append each page to the output writer.
                for iPage in range(pageCount):
                    output.addPage(reader.getPage(iPage))

            print("合并后的总页数:%d."%outputPages)
            # Write the merged document to the target file.
            with open(out_path, "wb") as outputStream:
                output.write(outputStream)
            print("PDF文件合并完成!")
        finally:
            for stream in handles:
                stream.close()
    
    # Script entry point.
    def main():
        """Merge the PDFs of the configured folder and report the elapsed time."""
        started = time.time()
        source_dir = r'E:\test\ac3'  # folder that holds the source PDFs
        merged_name = "Cheat_Sheets.pdf"  # file name of the merged PDF
        MergePDF(source_dir, merged_name)
        elapsed = time.time() - started
        print('总共耗时:%s s.' % elapsed)

    main()
    

    运行时可能会报错;如果报错,可以注释掉 site-packages/PyPDF2/generic.py 中引发该错误的那段代码:

    标页码

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    # Usage text shown on every run; printed before the heavier imports below.
    helpDoc = '''
    Add Page Number to PDF file with Python
    Python 给 PDF 添加 页码
    usage:
        python addPageNumberToPDF.py [PDF path]
    require:
        pip install reportlab pypdf2
        Support both Python2/3, But more recommend Python3
    
    tips:
        * output file will save at pdfWithNumbers/[PDF path]_page.pdf
        * only support A4 size PDF
        * tested on Python2/Python3@ubuntu
        * more large size of PDF require more RAM
        * if segmentation fault, please try using Python 3
        * if generate PDF document is damaged, please try using Python 3
    
    Author:
        Lei Yang (ylxx@live.com)
    
    GitHub:
        https://gist.github.com/DIYer22/b9ede6b5b96109788a47973649645c1f
    '''
    print(helpDoc)
    
    import reportlab
    from reportlab.lib.units import mm
    from reportlab.pdfgen import canvas
    
    from PyPDF2 import PdfFileWriter, PdfFileReader
    
    def createPagePdf(num, tmp):
        """Write a PDF to *tmp* with *num* pages that contain only a page number.

        Page ``i`` (1-based) carries the text ``str(i)``. The file is meant to
        be overlaid page by page onto another document.

        Args:
            num: Number of pages to generate.
            tmp: Path of the PDF file to create.

        Returns:
            None. The caller re-opens *tmp* itself, so no page object is
            returned.
        """
        # NOTE(review): the canvas uses reportlab's default page size,
        # presumably A4 -- confirm if other page sizes must be supported.
        c = canvas.Canvas(tmp)
        for i in range(1, num + 1):
            # The text starts at x = 105 mm (midpoint of a 210 mm wide page),
            # 4 mm above the bottom edge.
            c.drawString((210//2)*mm, (4)*mm, str(i))
            c.showPage()
        c.save()
    
    
    if __name__ == "__main__":
        import os
        import sys

        # Default PDF to number; used only when no path is given on the command line.
        path = 'E:\\test\\ac2\\3.pdf'
        if len(sys.argv) == 1:
            if not os.path.isfile(path):
                # Exit status stays 1, but say why instead of quitting silently.
                sys.exit('PDF file not found: %s' % path)
        else:
            path = sys.argv[1]

        # splitext instead of slicing off 4 characters, so extensions of any
        # length (or none) are handled.
        stem, ext = os.path.splitext(os.path.basename(path))

        # Every output file goes into a "pdfWithNumbers" folder next to the
        # source PDF; that same folder is the one created below.
        out_dir = os.path.join(os.path.dirname(path), 'pdfWithNumbers')

        # Scratch PDF holding one number-only page per source page.
        tmp = "__tmp.pdf"

        # Pages per output file; 0 writes the whole document into one file.
        batch = 0

        def writePart(writer, index):
            """Write *writer* to ``<out_dir>/<stem>_page_<index><ext>``."""
            newpath = os.path.join(out_dir, '%s_page_%d%s' % (stem, index, ext))
            with open(newpath, 'wb') as fout:
                writer.write(fout)

        try:
            with open(path, 'rb') as fsrc:
                pdf = PdfFileReader(fsrc, strict=False)
                n = pdf.getNumPages()
                if batch <= 0:
                    # One batch covering every page; max() keeps the modulo
                    # below defined for a PDF with no pages.
                    batch = max(n, 1)
                createPagePdf(n, tmp)
                if not os.path.isdir(out_dir):
                    os.makedirs(out_dir)
                with open(tmp, 'rb') as ftmp:
                    numberPdf = PdfFileReader(ftmp)
                    output = PdfFileWriter()
                    part = 1  # 1-based index used in the output file name
                    for p in range(n):
                        # A full batch has been collected: flush it and start
                        # a fresh writer for the next part.
                        if p and p % batch == 0:
                            writePart(output, part)
                            part += 1
                            output = PdfFileWriter()
                        print('page: %d of %d'%(p, n))
                        page = pdf.getPage(p)
                        # Overlay the number-only page onto the source page.
                        page.mergePage(numberPdf.getPage(p))
                        output.addPage(page)
                    # Flush whatever is left (the only part when batch is 0).
                    if output.getNumPages():
                        writePart(output, part)
        finally:
            # Remove the scratch file even when numbering fails part-way.
            if os.path.exists(tmp):
                os.remove(tmp)
    

    相关文章

      网友评论

          本文标题:用python合并多个pdf文件并标页码

          本文链接:https://www.haomeiwen.com/subject/rfhswctx.html