简书HTML解析成Markdown(Python)

作者: 素笺淡墨染流年 | 来源:发表于2018-12-03 11:43 被阅读62次

简书HTML解析成Markdown(Python)
简书markDown初级使用
2018-08-07写点东西吧
Android事件分发流程图
Towxml 3.0让微信小程序支持LaTex数学公式及yuml
使用MMMarkdown将Markdown文本解析成HTML
Day01 Python基础
通过RESTAPI文档生成SDK
让微信小程序支持html内容及代码高亮显示
重绘和重排（回流）

想着把我的简书blog同步到自己的blog，这个工具应运而生了。（/滑稽）

做的时候大概想的2个方案：

登录简书下载md
解析简书html为md

第一个方案在登录时需要通过图片点击验证，比较麻烦，于是放弃了，所以就采用第二个方案。
自己的blog后端是用Python写的，语言也就python了。
github搜索了一下，库还蛮多的，我选择了其中的Tomd，
当然里面的规则并不能完全解析简书的html,比如图片规则，所以我对里面做了一些修改，改的不多。

# tomd.py
# coding: utf-8

import re
import os
import warnings

__all__ = ['Tomd', 'convert']

# Markdown wrappers: tag name -> (prefix, suffix) put around the converted
# content of that tag. Headings h1..h6 are prefixed with 1..6 '#' characters.
MARKDOWN = {
    'h{}'.format(level): ('\n' + '#' * level + ' ', '\n')
    for level in range(1, 7)
}
# The remaining tags are added in a fixed order after the headings.
MARKDOWN.update([
    ('code', ('`', '`')),
    ('ul', ('', '')),
    ('ol', ('', '')),
    ('li', ('- ', '')),
    ('blockquote', ('\n> ', '\n')),
    ('em', ('*', '*')),
    ('strong', ('**', '**')),
    ('block_code', ('\n```\n', '\n```\n')),
    ('span', ('', '')),
    ('p', ('\n', '\n')),
    ('p_with_out_class', ('\n', '\n')),
    ('inline_p', ('', '')),
    ('inline_p_with_out_class', ('', '')),
    ('b', ('**', '**')),
    ('i', ('*', '*')),
    ('del', ('~~', '~~')),
    ('hr', ('\n---', '\n\n')),
    ('thead', ('\n', '|------\n')),
    ('tbody', ('\n', '\n')),
    ('td', ('|', '')),
    ('th', ('|', '')),
    ('tr', ('', '\n')),
    ('table', ('', '\n')),
    ('e_p', ('', '\n')),
    ('jianshu_img', ('', '')),
])

# Block-level HTML patterns: tag name -> regex. The capture group(s) hold the
# inner content that becomes the element's text ('hr' has no group).
# Raw strings are used so regex escapes such as \s are not parsed as
# (invalid) Python string escapes.
# NOTE: the name is spelled "BlOCK" (lowercase L); the spelling is kept
# because other code in this file refers to it by that name.
BlOCK_ELEMENTS = {
    'h1': r'<h1.*?>(.*?)</h1>',
    'h2': r'<h2.*?>(.*?)</h2>',
    'h3': r'<h3.*?>(.*?)</h3>',
    'h4': r'<h4.*?>(.*?)</h4>',
    'h5': r'<h5.*?>(.*?)</h5>',
    'h6': r'<h6.*?>(.*?)</h6>',
    'hr': r'<hr/>',
    'blockquote': r'<blockquote.*?>.*?>(.*?)</.*?</blockquote>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'block_code': r'<pre.*?><code.*?>(.*?)</code></pre>',
    'p': r'<p\s.*?>(.*?)</p>',  # <p> carrying attributes
    'p_with_out_class': r'<p>(.*?)</p>',  # bare <p>
    'thead': r'<thead.*?>(.*?)</thead>',
    'tr': r'<tr.*?>(.*?)</tr>',
    # Jianshu image container: captures up to the end of the caption div.
    'jianshu_img': r'<div.+?image-package.*?>(.+?image-caption.+?</div>)[\s\S]*?</div>',
}

# Inline HTML patterns: tag name -> regex applied inside a block's content.
# Raw strings are used wherever the pattern contains regex escapes such as \s,
# so they are not parsed as (invalid) Python string escapes. The td/tr/tbody
# patterns deliberately stay non-raw: their '\n' is a literal newline
# character inside the pattern.
INLINE_ELEMENTS = {
    'td': '<td.*?>((.|\n)*?)</td>',  # td element may span lines
    'tr': '<tr.*?>((.|\n)*?)</tr>',
    'th': r'<th.*?>(.*?)</th>',
    'b': r'<b.*?>(.*?)</b>',
    'i': r'<i.*?>(.*?)</i>',
    'del': r'<del.*?>(.*?)</del>',
    'inline_p': r'<p\s.*?>(.*?)</p>',
    'inline_p_with_out_class': r'<p>(.*?)</p>',
    'code': r'<code.*?>(.*?)</code>',
    'span': r'<span.*?>(.*?)</span>',
    'ul': r'<ul.*?>(.*?)</ul>',
    'ol': r'<ol.*?>(.*?)</ol>',
    'li': r'<li.*?>(.*?)</li>',
    'img': r'<img.*?src="(.*?)".*?>(.*?)</img>',
    'img_single': r'<img.*?src="(.*?)".*?/>',
    'img_single_no_close': r'<img.*?src="(.*?)".*?>',
    'a': r'<a.*?href="(.*?)".*?>(.*?)</a>',
    'em': r'<em.*?>(.*?)</em>',
    'strong': r'<strong.*?>(\s*)(.*?)(\s*)</strong>',
    'tbody': '<tbody.*?>((.|\n)*)</tbody>',
    # group 1: image URL from data-original-src, group 2: caption text
    'jianshu_img': r'data-original-src="(.+?)"[\s\S]*image-caption">(.+?)</div>',
}

# Regex patterns for leftover markup that is removed from the final Markdown.
DELETE_ELEMENTS = [
    '<span.*?>',
    '</span>',
    '<div.*?>',
    '</div>',
    '<br clear="none"/>',
    '<center.*?>',
    '</center>',
]


class Element:
    """One matched HTML block together with its Markdown conversion.

    Args:
        start_pos: Start offset of the match in the source HTML.
        end_pos: End offset of the match in the source HTML.
        content: Captured inner HTML of the block.
        tag: Key naming the block type (looked up in ``MARKDOWN``).
        folder: Base folder used to locate Evernote todo images on disk.
        is_block: When true, inline markup in ``content`` is converted
            immediately by ``parse_inline``.
    """

    def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.content = content
        self._elements = []
        self.is_block = is_block
        self.tag = tag
        self.folder = folder
        self._result = None

        if self.is_block:
            self.parse_inline()

    def __str__(self):
        """Return the content wrapped in the Markdown prefix/suffix of its tag."""
        wrapper = MARKDOWN.get(self.tag)
        self._result = '{}{}{}'.format(wrapper[0], self.content, wrapper[1])
        return self._result

    def parse_inline(self):
        """Rewrite ``self.content`` in place, turning inline HTML into Markdown."""
        if self.tag == 'jianshu_img':
            m = re.search(INLINE_ELEMENTS.get(self.tag), self.content)
            if m:
                self.content = '<img src="https:{}">{}</img>'.format(m.group(1), m.group(2))
            else:
                # The pattern needs a non-empty caption; an image without one
                # used to crash on m.group(). Fall back to the source alone.
                src = re.search(r'data-original-src="(.+?)"', self.content)
                if src:
                    self.content = '<img src="https:{}"></img>'.format(src.group(1))

        self.content = self.content.replace('\r', '')  # windows \r character
        # No-break space: '\xc2\xa0' is the UTF-8 byte pair as it shows up in
        # mis-decoded text; '\xa0' is the actual character in a Python 3 str.
        self.content = self.content.replace('\xc2\xa0', ' ')
        self.content = self.content.replace('\xa0', ' ')
        self.content = self.content.replace('&quot;', '"')  # html quote mark

        for m in re.finditer("<img(.*?)en_todo.*?>", self.content):
            # remove img and change to [ ] and [x]
            # evernote specific parsing
            imgSrc = re.search('src=".*?"', m.group())
            imgLoc = imgSrc.group()[5:-1]  # remove source and " "
            imgLoc = imgLoc.replace('\\', '/')  # \\ folder slash rotate
            # The image's file size decides between unchecked and checked.
            if os.stat(self.folder + "/" + imgLoc).st_size < 250:
                self.content = self.content.replace(m.group(), "[ ] ")
            else:
                self.content = self.content.replace(m.group(), "[x] ")

        if "e_" in self.tag:  # evernote-specific parsing
            # 'table' may not be defined as a block pattern; look it up
            # defensively instead of raising KeyError.
            table_pattern = BlOCK_ELEMENTS.get('table')
            if table_pattern:
                m = re.search(table_pattern, self.content, re.I | re.S | re.M)
                if m:
                    # Only the first table is handled; its converted content
                    # replaces this element's content.
                    inner = Element(start_pos=m.start(),
                                    end_pos=m.end(),
                                    content=''.join(m.groups()),
                                    tag='table', folder=self.folder,
                                    is_block=True)
                    self.content = inner.content
                    return  # no further parsing after a table

            # if no table, parse as usual
            self.content = self.content.replace('<hr/>', '\n---\n')
            self.content = self.content.replace('<br/>', '')

        if self.tag == "table":  # for removing tbody
            self.content = re.sub(INLINE_ELEMENTS['tbody'], r'\g<1>', self.content)

        # Patterns are applied in alphabetical key order.
        for tag in sorted(INLINE_ELEMENTS):
            pattern = INLINE_ELEMENTS[tag]

            if tag == 'a':
                self.content = re.sub(pattern, r'[\g<2>](\g<1>)', self.content)
            elif tag == 'img':
                self.content = re.sub(pattern, r'![\g<2>](\g<1>)', self.content)
            elif tag in ('img_single', 'img_single_no_close'):
                self.content = re.sub(pattern, r'![](\g<1>)', self.content)
            elif self.tag == 'ul' and tag == 'li':
                self.content = re.sub(pattern, '- \\g<1>\n', self.content)
            elif self.tag == 'ol' and tag == 'li':
                self.content = re.sub(pattern, '1. \\g<1>\n', self.content)
            elif self.tag == 'thead' and tag == 'tr':
                self.content = re.sub(pattern, '\\g<1>\n', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'th':
                self.content = re.sub(pattern, r'|\g<1>', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'td':
                self.content = re.sub(pattern, r'|\g<1>|', self.content.replace('\n', ''))
                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
            elif self.tag == 'table' and tag == 'td':
                self.content = re.sub(pattern, r'|\g<1>|', self.content)
                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
                self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
                self.construct_table()
            else:
                wrapper = MARKDOWN.get(tag)
                # 'strong' captures surrounding whitespace in groups 1 and 3;
                # only the text in group 2 is kept.
                group = 2 if tag == "strong" else 1
                template = '{}\\g<{}>{}'.format(wrapper[0], group, wrapper[1])
                self.content = re.sub(pattern, template, self.content)

        if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2:
            # focusing on div, add new line if not there (and if content is long enough)
            self.content += '\n'

    def construct_table(self):
        """Prepend Markdown ``|---|`` separator rows to pipe-delimited content.

        The column count comes from the number of pipes in the first
        non-empty line. Content without any non-empty line is left untouched.
        """
        first_row = next(
            (row for row in self.content.split('\n', 3) if row != ""),
            None,
        )
        if first_row is None:
            # Nothing to measure; the original raised UnboundLocalError here.
            return
        count = first_row.count("|")  # count number of pipes
        pipe = "\n|" + "---|" * (count - 1) + "\n"  # beginning \n for safety
        self.content = pipe + pipe + self.content + "\n"  # TODO: column titles?
        self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
        self.content = self.content.replace("<br/>\n", "<br/>")  # keep <br/> cells on one line


class Tomd:
    """Convert an HTML fragment to Markdown and optionally write it to a file.

    Args:
        html: HTML text to convert.
        folder: Default output folder; also passed to each element as the
            base folder for image lookups.
        file: Output file name; a ``.html`` suffix is replaced by ``.md``.
        options: Stored but currently unused.
    """

    def __init__(self, html='', folder='', file='', options=None):
        self.html = html  # actual data
        self.folder = folder
        self.file = file
        self.options = options  # haven't been implemented yet
        self._markdown = self.convert(self.html, self.options)

    def convert(self, html="", options=None):
        """Convert ``html`` to Markdown, store it on the instance and return it.

        Args:
            html: HTML text; when empty, ``self.html`` is used instead.
            options: Unused.

        Returns:
            The Markdown text.
        """
        if html == "":
            html = self.html
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
                # now m contains the pattern without the tag
                element = Element(start_pos=m.start(),
                                  end_pos=m.end(),
                                  content=''.join(m.groups()),
                                  tag=tag,
                                  folder=self.folder,
                                  is_block=True)
                start, end = m.start(), m.end()
                # Skip the new match if an existing element strictly encloses it.
                can_append = not any(
                    e.start_pos < start and e.end_pos > end for e in elements
                )
                # Drop existing elements the new match strictly encloses. A new
                # list is built because removing from the list while iterating
                # it skipped the element following each removal.
                elements = [
                    e for e in elements
                    if not (e.start_pos > start and e.end_pos < end)
                ]
                if can_append:
                    elements.append(element)
        elements.sort(key=lambda element: element.start_pos)
        self._markdown = ''.join([str(e) for e in elements])

        for leftover in DELETE_ELEMENTS:
            self._markdown = re.sub(leftover, '', self._markdown)
        return self._markdown

    @property
    def markdown(self):
        """Re-run the conversion of ``self.html`` and return the Markdown."""
        self.convert(self.html, self.options)
        return self._markdown

    def export(self, folder=False):
        """Write the converted Markdown to a UTF-8 file.

        Args:
            folder: Output folder. When given it takes precedence; otherwise
                ``self.folder`` is used, and if that is also unset the file
                goes to the current working directory (with a warning).
        """
        if len(self.file) < 1:
            warnings.warn("file not specified, renamed to tmp.md")
            file = "tmp.md"
        else:
            file = self.file.replace('.html', '.md')  # rename to md
        if folder:
            # An explicit folder used to be ignored when self.folder was unset.
            file = folder + '/' + file
        elif len(self.folder) < 2:
            warnings.warn("folder not specified, will save to pwd")
        else:
            file = self.folder + '/' + file
        # The context manager closes the file even if the write fails.
        with open(file, 'w', encoding='utf8') as f:
            f.write(self._markdown)


# Module-level convenience API: one shared Tomd instance, created with empty
# HTML, whose bound ``convert`` method is exported as ``convert(html)``.
# Each call stores its result on this shared instance's ``_markdown``.
_inst = Tomd()
convert = _inst.convert

然后上一个测试:

import re

import requests
from bs4 import BeautifulSoup

import tomd


def main():
    """Fetch one Jianshu article and export its body as Markdown.

    The article body is the inner HTML of ``div.show-content-free``; it is
    printed and then written out through ``tomd`` with the file name
    ``test.md``. Nothing is written when that div is not found.
    """
    url = 'https://www.jianshu.com/p/c75f1ce0a6ae'
    headers = {
        'referer': url,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url=url, headers=headers, timeout=10)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed and emits a warning, so results vary by machine.
    soup = BeautifulSoup(res.text, 'html.parser')

    content = soup.select_one('div.show-content-free')

    # Greedy match up to the last </div> yields the container's inner HTML.
    m = re.search(r'div\sclass="show-content-free">([\s\S]*)</div>', str(content))
    if m:
        html = m.group(1)
        print(html)
        tomd.Tomd(html=html, file='test.md').export()


if __name__ == '__main__':
    main()

看看结果：

gif.gif

Github地址

简书HTML解析成Markdown(Python)
想着把我的简书blog同步到自己的blog，这个工具应运而生了。（/滑稽）做的时候大概想的2个方案：登录简书下...
简书markDown初级使用
为什么用markDown markDown是纯文本，可以解析成HTML。如何使用markDown 标题一级标题...
2018-08-07写点东西吧
markdown简书也支持所以特地注册了来试试，我是因为python，markdown终于来到简书。以前也听说过简...
Android事件分发流程图
简书的markdown编辑器貌似不能解析html，只能截图贴上，觉得模糊的朋友可以访问原博客地址。一、ViewG...
Towxml 3.0让微信小程序支持LaTex数学公式及yuml
Towxml 是一个让小程序可以解析Markdown、HTML的解析库。能够使小程序完美解析Markdown内容。...
使用MMMarkdown将Markdown文本解析成HTML
前言：MMMarkdown是一个可以将Markdown文本解析成HTML的Objective-C框架，它的使用环境...
Day01 Python基础
1. Python安装环境及相关软件 2. markdown与简书 2.1 markdown基本语法 2.1.1 ...
通过RESTAPI文档生成SDK
文档是通过 markdown 的形式来写的，通过python写一个解析工具来解析 markdown，把 API 的...
让微信小程序支持html内容及代码高亮显示
towxml是一个把html、markdown内容解析成微信小程序可显示的库。拥有极致的内容排版和高容错的解析。 ...
重绘和重排（回流）
一、浏览器渲染HTML的步骤 HTML被HTML解析器解析成DOM Tree, CSS则被CSS解析器解析成CSS...