美文网首页
从CSDN搬家

从CSDN搬家

作者: xieyan0811 | 来源:发表于2021-12-18 14:00 被阅读0次

    说明

    上一次搬家是十一年前从MSN搬到CSDN,那时候MSN博客即将关闭。
    这一次是由CSDN搬到本地,改用Obsidian的Markdown管理文档。
    毕竟时代不同了,咱也得与时俱进,话不多说,上代码。

    技术

    具体用到了以下技术:

    • bs4:HTML/XML解析
    • html2text:从html中提取文本
    • urllib:下载网页

    代码

    本代码在2021年12月18日能正常使用;考虑到网站常常改版,未来使用时可能需要做一些调整。代码可在Python 3环境下运行。

    # coding=utf-8
    
    from bs4 import BeautifulSoup
    import urllib.request as request
    import codecs
    import re
    import os
    import html2text
    
    class Analyzer(object):
        """Base class that downloads a page and locates its main content node."""

        # Browser-like User-Agent *value*. The header name is supplied as the
        # dict key in get(); it must not be repeated inside the value.
        USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/41.0.2272.118 Safari/537.36')

        def __init__(self):
            super(Analyzer, self).__init__()

        def get(self, url):
            """Fetch ``url`` and return the raw response body.

            Args:
                url: Address to download.

            Returns:
                The bytes returned by reading the response.

            Any exception raised by ``urllib.request`` propagates to the caller.
            """
            req = request.Request(url, headers={'User-Agent': self.USER_AGENT})
            # The context manager closes the response even if read() fails.
            with request.urlopen(req) as response:
                return response.read()

        def getContent(self, soup):
            """Return the result of looking up ``main`` inside ``body`` of ``soup``.

            Args:
                soup: Parsed document; callers in this file pass a BeautifulSoup
                    object.

            Returns:
                Whatever ``soup.find('body').find('main')`` yields.
            """
            return soup.find('body').find('main')
    
    class Exporter(Analyzer):
        """Export one article page to a Markdown file with a metadata header."""

        def __init__(self):
            super(Exporter, self).__init__()

        def export(self, link, path):
            """Download the article at ``link`` and write it as ``<name>.md``.

            The file starts with a ``---`` delimited metadata header (title,
            date, tags, source address), followed by the title, one ``#tag``
            per tag, and the article body converted to text by html2text.

            Args:
                link: URL of the article page.
                path: Directory that receives the Markdown file; it is created
                    when it does not exist yet.
            """
            html_doc = self.get(link)
            # Name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed.
            soup = BeautifulSoup(html_doc, 'html.parser')
            detail = self.getContent(soup)

            title = html2text.html2text(detail.find(class_='title-article').prettify())
            content = html2text.html2text(detail.find(class_='article_content').prettify())

            # Build the tag list: every "[...]" group in the converted tag box
            # is taken as one tag. A page without a tag box yields no tags.
            tags_box = detail.find(class_='blog-tags-box')
            if tags_box is None:
                tags = []
            else:
                tags_text = html2text.html2text(tags_box.prettify())
                tags_text = re.sub('[\n ]', '', tags_text)
                tags = re.findall(r'\[(.*?)\]', tags_text, re.S)

            # The file name is the title without '#', spaces and '/'.
            name = re.sub('[# /]', '', title.strip())
            date = html2text.html2text(detail.find(class_='time').prettify()).strip()

            if path:
                os.makedirs(path, exist_ok=True)
            filename = os.path.join(path, f"{name}.md")

            # Metadata header, kept for later indexing. Built line by line so
            # the output does not depend on how this source file is indented.
            info = (
                "---\n"
                f"title: {name}\n"
                f"date: {date}\n"
                f"tags: {tags}\n"
                f"addr: {link}\n"
                "---\n"
                "\n"
            )

            # The with-block closes the file even when a write fails.
            with codecs.open(filename, 'w', encoding='utf-8') as f:
                f.write(info)
                f.write(title)
                f.write("".join(f"#{tag} " for tag in tags))
                f.write("\n\n")
                f.write(content)

        def run(self, link, path):
            """Export the article at ``link`` into directory ``path``."""
            self.export(link, path)
    
    class Parser(Analyzer):
        """Collect article links from a blog's list pages and export each one."""

        def __init__(self):
            super(Parser, self).__init__()
            self.article_list = []  # article URLs gathered so far
            self.page = -1          # number of list pages to visit; -1 = not set

        def parse(self, html_doc):
            """Append every article link found in one list page.

            Args:
                html_doc: Raw HTML of a list page.

            Returns:
                The number of links added to ``self.article_list``; 0 when the
                page has no ``article-list`` element.
            """
            soup = BeautifulSoup(html_doc, 'html.parser')
            listing = self.getContent(soup).find(class_="article-list")
            if listing is None:
                return 0
            links = [item.h4.a['href']
                     for item in listing.find_all(class_='article-item-box')]
            self.article_list.extend(links)
            return len(links)

        def getAllArticleLink(self, url, page_count=10):
            """Walk the list pages ``<url>/article/list/<n>`` and gather links.

            Args:
                url: Base URL of the blog.
                page_count: Highest list page to request. Defaults to 10, the
                    value the original script hard-coded.

            Pages are visited from 1 upwards; the walk stops early at the first
            page that contributes no links (assumed to be past the last page).
            """
            self.page = page_count
            for i in range(1, self.page + 1):
                print("work page", i, len(self.article_list))
                if not self.parse(self.get(url + '/article/list/' + str(i))):
                    break

        def export(self, path):
            """Export every collected article into directory ``path``."""
            print("article count", len(self.article_list))
            # Exporter keeps no per-article state, so one instance is reused.
            exporter = Exporter()
            for link in self.article_list:
                print("link", link)
                exporter.run(link, path)

        def run(self, url, path):
            """Reset state, collect all article links of ``url``, export to ``path``."""
            self.page = -1
            self.article_list = []
            print("getting-link")
            self.getAllArticleLink(url)
            print("now export")
            self.export(path)
            print("finished")
    
    # Script entry: collect the article links of one CSDN blog and export
    # them as Markdown files into the 'tmp' directory.
    username = 'xxxxxx'  # presumably a placeholder for the real account name
    url = f'http://blog.csdn.net/{username}'
    parser = Parser()
    parser.run(url=url, path='tmp')
    

    参考

    相关文章

      网友评论

          本文标题:从CSDN搬家

          本文链接:https://www.haomeiwen.com/subject/cdrgfrtx.html