Overview
The last time I migrated my blog was eleven years ago, from MSN to CSDN, back when the MSN blog service was about to shut down.
This time the move is from CSDN to local files, managed as Markdown documents in Obsidian.
Times have changed and one has to keep up. Without further ado, here is the code.
Technologies
The following libraries are used (a minimal sketch of how they fit together follows the list):
- bs4: HTML parsing
- html2text: converting HTML to Markdown-formatted text
- urllib: downloading web pages
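Roughly, the three chain together like this: urllib fetches the raw HTML, bs4 locates the node of interest, and html2text turns that node into Markdown. A minimal sketch of the fetch-parse-convert pipeline, where the URL is a placeholder:

```python
# Minimal fetch-parse-convert sketch; the URL is a placeholder.
import urllib.request as request

from bs4 import BeautifulSoup
import html2text

req = request.Request('https://example.com/post',
                      headers={'User-Agent': 'Mozilla/5.0'})
html_doc = request.urlopen(req).read()

soup = BeautifulSoup(html_doc, 'html.parser')    # bs4 parses the HTML
node = soup.find('body')                         # locate the node to keep
markdown = html2text.html2text(node.prettify())  # convert it to Markdown
print(markdown)
```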
Code
This code worked correctly as of 2021-12-18; since the site is redesigned from time to time, some adjustments may be needed in the future. The code runs under Python 3.
```python
# coding=utf-8
from bs4 import BeautifulSoup
import urllib.request as request
import codecs
import re
import os

import html2text


class Analyzer(object):
    """Base class: downloads a page and locates its main content node."""

    def get(self, url):
        # Spoof a desktop browser User-Agent so CSDN serves the page normally
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/41.0.2272.118 Safari/537.36'}
        req = request.Request(url, headers=headers)
        html_doc = request.urlopen(req).read()
        return html_doc

    def getContent(self, soup):
        # The article markup lives under <body><main> on current CSDN pages
        return soup.find('body').find('main')


class Exporter(Analyzer):
    """Downloads a single article and writes it out as a Markdown file."""

    def export(self, link, path):
        html_doc = self.get(link)
        soup = BeautifulSoup(html_doc, 'html.parser')
        detail = self.getContent(soup)
        title = html2text.html2text(detail.find(class_='title-article').prettify())
        content = html2text.html2text(detail.find(class_='article_content').prettify())
        # Extract the tags
        tags = html2text.html2text(detail.find(class_='blog-tags-box').prettify())
        tags = re.sub(r'[\n ]', '', tags)
        tags = re.findall(re.compile(r'[[](.*?)[]]', re.S), tags)
        # Strip characters that are illegal or awkward in file names
        name = title.strip()
        name = re.sub('[# /]', '', name)
        date = html2text.html2text(detail.find(class_='time').prettify())
        date = date.strip()
        filename = os.path.join(path, f"{name}.md")
        f = codecs.open(filename, 'w', encoding='utf-8')
        # Write front-matter metadata so the notes can be indexed later
        info = f"""---
title: {name}
date: {date}
tags: {tags}
addr: {link}
---
"""
        f.write(info)
        f.write(title)
        for tag in tags:
            f.write(f"#{tag} ")
        f.write("\n\n")
        f.write(content)
        f.close()

    def run(self, link, path):
        self.export(link, path)


class Parser(Analyzer):
    """Walks the paginated article list and exports every article it finds."""

    def __init__(self):
        super(Parser, self).__init__()
        self.article_list = []
        self.page = -1

    def parse(self, html_doc):
        # Collect the link of each article on one list page
        soup = BeautifulSoup(html_doc, 'html.parser')
        res = self.getContent(soup).find(class_='article-list').find_all(class_='article-item-box')
        for ele in res:
            self.article_list.append(ele.h4.a['href'])

    def getAllArticleLink(self, url):
        self.page = 10  # my article list has only 10 pages
        for i in range(1, self.page + 1):
            print("work page", i, len(self.article_list))
            self.parse(self.get(url + '/article/list/' + str(i)))

    def export(self, path):
        print("article count", len(self.article_list))
        for link in self.article_list:
            print("link", link)
            exporter = Exporter()
            exporter.run(link, path)

    def run(self, url, path):
        self.page = -1
        self.article_list = []
        print("getting-link")
        self.getAllArticleLink(url)
        print("now export")
        self.export(path)
        print("finished")


if __name__ == '__main__':
    username = 'xxxxxx'
    url = 'http://blog.csdn.net/' + username
    os.makedirs('tmp', exist_ok=True)  # the output directory must exist
    parser = Parser()
    parser.run(url, 'tmp')
```
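Each exported file starts with the YAML-style front matter written above (title, date, tags, addr), followed by the title, a line of #tag markers, and the converted article body. If only a single article is needed, the Exporter can also be driven directly; a minimal sketch, assuming the code above is saved as csdn_export.py (the module name and the link are placeholders):

```python
# Hypothetical single-article export; module name and link are placeholders.
# Importing is safe because the script guards its entry point with __main__.
import os

from csdn_export import Exporter

os.makedirs('tmp', exist_ok=True)  # the Exporter expects the directory to exist
exporter = Exporter()
exporter.run('http://blog.csdn.net/xxxxxx/article/details/123456', 'tmp')
```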
References
- Python 爬虫利器二之 Beautiful Soup 的用法 (a Chinese tutorial on using Beautiful Soup)
- Reference project: https://github.com/gaocegege/csdn-blog-export (a CSDN export tool from two years ago; CSDN's page layout has changed since then)