美文网首页
利用python抓取pdf单页面合成文件 2019-08-20

利用python抓取pdf单页面合成文件 2019-08-20

作者: 浮土_151d | 来源:发表于2019-08-20 11:24 被阅读0次

    注释,已失效。

    import glob
    import os
    import shutil
    import subprocess

    import PyPDF2
    import requests

    # Make the download directory

    # Show the starting directory, then ask which edition (date) to fetch.
    print(os.getcwd())
    print("Please input the day like yyyymmdd:")
    date = input()

    # Build the per-date directory with os.path.join: the original '\' literal
    # was an unterminated string because the backslash escaped the quote.
    # The name also no longer shadows the builtin dir().
    target_dir = os.path.join(os.getcwd(), date)

    # Create the directory when it is missing, then work inside it so the
    # downloaded pages are written there.
    os.makedirs(target_dir, exist_ok=True)
    os.chdir(target_dir)

    # Download the pages from A001 to M020

    def downpdf(url1, page1):
        """Download one PDF and save it as ``<page1>.pdf`` in the current directory.

        Args:
            url1: URL of the PDF to fetch.
            page1: Page name, used as the base name of the output file.

        Returns:
            True when the server answered with status 200 and the file was
            written; False for any other status code or when the request
            itself failed (connection error, timeout, ...).
        """
        try:
            # The timeout keeps one unresponsive request from hanging the
            # whole run; requests waits indefinitely without it.
            res = requests.get(url1, timeout=30)
        except requests.RequestException:
            # Report a failed request the same way as a non-200 answer so
            # the caller's loop can continue with the next page.
            return False
        if res.status_code != 200:
            return False
        with open(page1 + ".pdf", 'wb') as f:
            f.write(res.content)
        print(page1 + ".pdf has been download!")
        return True

    # Check (download) each page

    # Page names: sections A, B, C, D and M, each numbered 001 through 020.
    page = [
        '{}{:03d}'.format(section, number)
        for section in 'ABCDM'
        for number in range(1, 21)
    ]
    # Request every page of the chosen edition; downpdf prints a line for
    # each page it manages to save.
    for page_name in page:
        url = 'http://online.wsj.com/public/resources/documents/print/WSJ_-{}-{}.pdf'.format(page_name, date)
        downpdf(url, page_name)

    # Merge the downloaded pages into one PDF, then remove the page folder.
    # The script has already changed into the per-date download directory, so
    # every path is derived from the current directory instead of the
    # hard-coded f:\py location. The original string literals were also
    # broken: a backslash directly before the closing quote escaped it.
    work_dir = os.getcwd()
    parent_dir = os.path.dirname(work_dir)
    combined_pdf = os.path.join(parent_dir, 'WSJ' + date + '.pdf')

    # Expand the wildcard here and sort it, so pages are merged in name order
    # without relying on a shell or on pdftk to expand '*.pdf'.
    page_files = sorted(glob.glob(os.path.join(work_dir, '*.pdf')))

    merged = False
    if page_files:
        try:
            # An argument list (no shell) keeps the user-typed date from
            # being interpreted as shell syntax.
            status = subprocess.call(['pdftk'] + page_files + ['cat', 'output', combined_pdf])
            merged = status == 0
        except OSError as exc:
            # pdftk is missing or could not be started.
            print("could not run pdftk: " + str(exc))
    if merged:
        print("the pdf has been combined!")

    # Keep the downloaded pages when the merge failed so they are not lost;
    # an empty folder has nothing worth keeping and is removed as well.
    if merged or not page_files:
        # Leave the folder before deleting it.
        # NOTE(review): Windows is expected to refuse removing a directory
        # that is a running process's current directory - confirm.
        os.chdir(parent_dir)
        try:
            shutil.rmtree(work_dir)
        except OSError as exc:
            print("could not delete " + work_dir + ": " + str(exc))
        else:
            print("the folden has been delet!")

    # Example page URL: http://online.wsj.com/public/resources/documents/print/WSJ_-A002-20180906.pdf

    相关文章

      网友评论

          本文标题:利用python抓取pdf单页面合成文件 2019-08-20

          本文链接:https://www.haomeiwen.com/subject/gxwusctx.html