Python WeChat Official Account Crawler Backup

Author: elijah777 | Published 2021-06-26 11:37

    Backing up official account articles with a crawler

    1. Introduction

    For high-quality official accounts or web pages, most people simply favorite the articles so they can reread or share them later. But sometimes a favorite article gets deleted, and it feels like losing something precious. Backing the articles up with a crawler in advance avoids that situation entirely.

    The technical approach: first use a packet-capture tool to obtain the JSON containing the article list, then parse the article URLs out of that JSON, fetch each article over HTTP, parse out the image addresses in the article and download the images, rewrite the image references in the response to point at the local copies, save the page locally, and finally use wkhtmltopdf to convert the saved page to PDF.

    2. Software and Tools

    a. A packet-capture tool, used to obtain the JSON result sets of official account articles: https://www.charlesproxy.com/

    b. wkhtmltopdf, to convert the saved pages to PDF: https://wkhtmltopdf.org/

    c. General-purpose tools such as PyCharm need no introduction.

    3. Capturing the JSON via Packet Capture

    Open WeChat on your computer, go into the official account, and view its history of past posts.


    Refresh and scroll down to load more historical articles, then inspect the traffic in Charles and filter for the requests we need using mp.weixin.qq.com/mp/profile_ext?action=getms

    This gives us the set of request URLs.


    Paste one of those URLs into Chrome and you can see that the request returns a JSON string; the content_url fields inside it are the article links we want.
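    A trimmed illustration of the returned JSON, limited to the fields the parser in section 4 actually reads (the values below are placeholders, not real data). Note that general_msg_list is itself a JSON string embedded inside the outer JSON, which is why the code later calls json.loads twice:

    captured_json_example = {
        "general_msg_list": '{"list": [{'
                            '"comm_msg_info": {"datetime": 1624678800, "type": 49},'
                            '"app_msg_ext_info": {"title": "...", "content_url": "http://mp.weixin.qq.com/s?...",'
                            '"is_multi": 1, "multi_app_msg_item_list": ['
                            '{"title": "...", "content_url": "..."}]}}]}'
    }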

    4. Extracting the Article URLs

    Copy the captured request records into a text file and write a small string-processing script to parse out the URLs, then send those requests to obtain the JSON containing content_url.

    Parse the URLs by splitting the text (the split on '200' below assumes each copied line contains the request URL followed by its HTTP status code):

    def str_to_json():
        raw = read_file(filepath)
        lines = raw.split('\n')
        data = []
        for line in lines:
            url = line.split('200')[0].strip()  # keep only the part before the "200" status code
            data.append(url)
        return data


    # read a text file
    def read_file(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            all_the_text = f.read()
        return all_the_text
    

    Save each response to a local file; supply your own cookie value in the headers below.

    
    def get_json(data):
        header = b'''
                    sec-ch-ua: " Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"
                    x-requested-with: XMLHttpRequest
                    sec-ch-ua-mobile: ?0
                    user-agent: 
                    sec-fetch-site: same-origin
                    sec-fetch-mode: cors
                    sec-fetch-dest: empty
                    accept-encoding: gzip, deflate, br
                    accept-language: zh-CN,zh;q=0.9
                    cookie: XXXX
                   '''

        # convert the raw header text into a dict (headers_raw_to_dict is sketched after this block)
        headers = headers_raw_to_dict(header)
        pattern = r"general_msg_list"
        num = 0
        for url in data:
            response = requests.get(url=url, headers=headers, verify=False)
            html = response.text
            result = re.search(pattern, html)
            print(html)
            print(result)
            if result is not None:  # only save responses that actually contain the article list
                num = num + 1
                save_file(json_filepath + str(num) + '.json', html)


    # save text to a file
    def save_file(fpath, fileContent):
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(fileContent)
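    The headers_raw_to_dict call above turns the raw header block into a dict; the function itself is not shown in this post (the name matches a helper shipped with the w3lib package). A minimal hand-rolled version could look like this:

    # Sketch of a headers_raw_to_dict implementation (not the original helper):
    # parse "name: value" lines from the raw header bytes into a dict usable by requests
    def headers_raw_to_dict(headers_raw):
        headers = {}
        for line in headers_raw.decode('utf-8').splitlines():
            line = line.strip()
            if not line or ':' not in line:
                continue
            name, value = line.split(':', 1)
            headers[name.strip()] = value.strip()
        return headers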
    
    

    Two path constants are needed:

    filepath = '/Users/../file/str2json.txt'
    json_filepath = '/Users/tmp/data/wechat/book/json/'
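    With these two constants in place, the capture-and-save stage can be run like this:

    # run the step above: parse the copied request urls and save every JSON response under json_filepath
    urls = str_to_json()
    get_json(urls)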
    

    Parse the article URLs from the saved JSON files:

    
    # extract article urls and related info from the saved json files
    def GetArticleList(jsondir):
        filelist = os.listdir(jsondir)
        ArtList = []
        pattern = r"unknown"

        for file in filelist:
            filepath = os.path.join(jsondir, file)
            if file.startswith('.'):  # skip hidden files such as .DS_Store
                continue
            filetxt = ReadFile(filepath)  # ReadFile is sketched after this block
            jsbody = json.loads(filetxt)
            result = re.search(pattern, str(jsbody))
            if result is not None:  # skip responses that contain "unknown"
                continue
            try:
                general_msg_list = jsbody["general_msg_list"]
            except Exception:
                print('error' + str(jsbody))
                general_msg_list = ''

            if general_msg_list == '':
                continue
            jsbd2 = json.loads(general_msg_list)  # general_msg_list is itself a json string
            msg_list = jsbd2["list"]
            for item in msg_list:  # one item can contain several articles
                artidx = 1  # this index only helps name the saved html files; it does not correspond to the real publishing slot (headline, second slot, and so on)
                comm_msg_info = item["comm_msg_info"]

                pubstamp = comm_msg_info["datetime"]
                pubdate = Timestamp2Datetime(pubstamp)  # Timestamp2Datetime is sketched after this block
                if comm_msg_info["type"] == 49:  # 49 is the normal image-and-text type; other types are ignored for now
                    app_msg_ext_info = item["app_msg_ext_info"]
                    url = app_msg_ext_info["content_url"]  # article link
                    idx = artidx
                    title = app_msg_ext_info["title"]
                    art = Article(url, pubdate, idx, title)
                    if len(url) > 3:  # skip incomplete urls
                        ArtList.append(art)
                    print(len(ArtList), pubdate, idx, title)
                    if app_msg_ext_info["is_multi"] == 1:  # several articles published in one push
                        multi_app_msg_item_list = app_msg_ext_info["multi_app_msg_item_list"]
                        for subArt in multi_app_msg_item_list:
                            artidx += 1  # number the sub-articles so their saved filenames do not collide
                            url = subArt["content_url"]
                            idx = artidx
                            title = subArt["title"]
                            art = Article(url, pubdate, idx, title)
                            if len(url) > 3:
                                ArtList.append(art)
                            print(len(ArtList), pubdate, idx, title)
        return ArtList


    # article class: url, publish date, index, title
    class Article():
        def __init__(self, url, pubdate, idx, title):
            self.url = url
            self.pubdate = pubdate
            self.idx = idx
            self.title = title
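    GetArticleList relies on two helpers that are not shown in this post, ReadFile and Timestamp2Datetime. Minimal versions consistent with how they are called above could look like this (the date format is my own choice):

    # Minimal sketches of the helpers referenced above (not part of the original post)
    def ReadFile(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()


    def Timestamp2Datetime(stamp):
        # turn the unix timestamp from comm_msg_info["datetime"] into a filename-friendly string
        return datetime.fromtimestamp(stamp).strftime('%Y%m%d_%H%M%S')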
            
    

    If any of the requested URLs is empty or invalid, exceptions may occur, so add null checks or exception handling where needed. The returned ArtList holds the parsed article URLs.

    5. Fetching Article Content and Saving Images

    Fetch the article content via its URL, rewrite the external resources it references, and download the images to local storage.

    # download the web page at url
    def DownLoadHtml(url):
        # build the request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
        }
        requests.packages.urllib3.disable_warnings()
        response = requests.get(url, headers=headers, proxies=None, verify=False)
        if response.status_code == 200:
            htmltxt = response.text  # the page body
            return htmltxt
        else:
            return None


    # rewrite the img src attributes so the images display correctly offline
    # htmltxt: page source  saveimgdir: directory to save images  htmlname: page name

    def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
        bs = BeautifulSoup(htmltxt, "lxml")  # build a BeautifulSoup object from the page source; the second argument is the parser
        imgList = bs.findAll("img")
        imgindex = 0
        for img in imgList:
            imgindex += 1
            originalURL = ""  # the real image url
            if "data-src" in img.attrs:  # some <img> tags have no data-src
                originalURL = img.attrs['data-src']
            elif "src" in img.attrs:  # fall back to src if present
                originalURL = img.attrs['src']
            if originalURL.startswith("//"):  # protocol-relative urls need "http:" prepended
                originalURL = "http:" + originalURL
            if len(originalURL) > 0:
                print("\r down imgs " + "▇" * imgindex + " " + str(imgindex), end="")
                if "data-type" in img.attrs:
                    imgtype = img.attrs["data-type"]
                else:
                    imgtype = "png"
                imgname = htmlname + "_" + str(imgindex) + "." + imgtype  # image name such as 1.png
                imgsavepath = os.path.join(saveimgdir, imgname)  # full path to save the image
                DownImg(originalURL, imgsavepath)  # DownImg is sketched after this block
                img.attrs["src"] = "images/" + imgname  # relative path used inside the saved page
            else:
                img.attrs["src"] = ""
        ChangeCssSrc(bs)  # fix the link tags
        ChangeContent(bs)  # clear the js_content style so the body is visible
        return str(bs)  # convert the BeautifulSoup object back to a string for saving


    def ChangeCssSrc(bs):
        linkList = bs.findAll("link")
        for link in linkList:
            href = link.attrs.get("href", "")
            if href.startswith("//"):
                newhref = "http:" + href
                link.attrs["href"] = newhref


    def ChangeContent(bs):
        jscontent = bs.find(id="js_content")
        if jscontent:
            jscontent.attrs["style"] = ""
        else:
            print("----- the article may have been deleted -----")


    # save the html to a local file
    def SaveFile(fpath, fileContent):
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write(fileContent)
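    ChangeImgSrc calls a DownImg helper that the post does not include either. A minimal version matching the call DownImg(originalURL, imgsavepath) could be:

    # Sketch of the missing DownImg helper: fetch one image and write it to imgsavepath
    def DownImg(url, imgsavepath):
        os.makedirs(os.path.dirname(imgsavepath), exist_ok=True)
        response = requests.get(url, verify=False)
        if response.status_code == 200:
            with open(imgsavepath, 'wb') as f:
                f.write(response.content)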
     
    

    6. Converting the HTML Files to PDF

    Use the third-party tool wkhtmltopdf for the conversion; the code below shells out to a command of the form wkhtmltopdf --enable-local-file-access <input.html> <output.pdf>.

    # convert every html file in a folder to pdf
    def PDFDir(htmldir, pdfdir):
        if not os.path.exists(pdfdir):
            os.makedirs(pdfdir)
        flist = os.listdir(htmldir)
        for f in flist:
            if (not f[-5:] == ".html") or ("tmp" in f):  # skip non-html files and temporary files containing "tmp"
                continue
            htmlpath = htmldir + "/" + f
            tmppath = htmlpath[:-5] + "_tmp.html"  # temporary file used for the pdf conversion
            htmlstr = ReadFile(htmlpath)
            bs = BeautifulSoup(htmlstr, "lxml")
            title = ""
            # the pdf filename includes the article title; conversion fails if the title contains characters not allowed in filenames
            titleTag = bs.find(id="activity-name")
            if titleTag is not None:
                title = "_" + titleTag.get_text().replace(" ", "").replace("\n", "")
            ridx = htmlpath.rindex("/") + 1
            pdfname = htmlpath[ridx:-5] + title
            pdfpath = pdfdir + "/" + pdfname + ".pdf"

            """
                strip out scripts and similar tags to reduce what wkhtmltopdf has to load;
                note that css (link) is removed here too, so if the pdf layout looks broken, keep the css
            """
            [s.extract() for s in bs(["script", "iframe", "link"])]
            SaveFile(tmppath, str(bs))
            PDFOne(tmppath, pdfpath)

    # convert a single html file to pdf
    def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
        if skipExists and os.path.exists(pdfpath):
            print("pdf exists", pdfpath)
            if removehtml:
                os.remove(htmlpath)
            return
        # --enable-local-file-access
        exepath = "wkhtmltopdf"
        cmdlist = []
        cmdlist.append("  --enable-local-file-access ")
        # cmdlist.append(" --load-error-handling ignore ")
        # cmdlist.append(" --page-height 200 ")  # adjust these numbers as needed, or omit both lines
        # cmdlist.append(" --page-width 140 ")
        cmdlist.append(" " + htmlpath + " ")
        cmdlist.append(" " + pdfpath + " ")
        cmdstr = exepath + "".join(cmdlist)
        print(cmdstr)
        os.system(cmdstr)
        if removehtml and os.path.exists(htmlpath):  # clean up the temporary html after conversion
            os.remove(htmlpath)
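    For completeness, a minimal driver that chains all of the steps above might look like the sketch below. The three directory constants are placeholders of my own; the images directory sits under the html directory because ChangeImgSrc rewrites image paths to the relative prefix images/.

    # End-to-end sketch (directory paths are placeholders)
    html_dir = '/Users/tmp/data/wechat/book/html'
    img_dir = os.path.join(html_dir, 'images')  # matches the "images/" prefix used in ChangeImgSrc
    pdf_dir = '/Users/tmp/data/wechat/book/pdf'

    def run_backup():
        os.makedirs(html_dir, exist_ok=True)
        get_json(str_to_json())                    # section 4: fetch and save the json responses
        articles = GetArticleList(json_filepath)   # parse article urls from the saved json
        for art in articles:
            name = str(art.pubdate) + '_' + str(art.idx)
            htmltxt = DownLoadHtml(art.url)        # section 5: download the article page
            if htmltxt is None:
                continue
            htmltxt = ChangeImgSrc(htmltxt, img_dir, name)  # download images and rewrite their src
            SaveFile(os.path.join(html_dir, name + '.html'), htmltxt)
        PDFDir(html_dir, pdf_dir)                  # section 6: convert everything to pdf

    if __name__ == '__main__':
        run_backup()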
            
    

    7. Other Notes

    The packages imported are:

    import os, sys
    import requests
    import json
    from bs4 import BeautifulSoup
    from datetime import datetime, timedelta
    from time import sleep
    import re
    

    I work on a Mac; on Windows you can use a different packet-capture tool such as Fiddler.

    References

    https://github.com/LeLe86/vWeChatCrawl

    Build an Article Crawler Step by Step (2): Downloading Pages

    Build an Article Crawler Step by Step (3): Batch Downloading

    2021/06/26, Chengdu
