美文网首页
简书HTML解析成Markdown(Python)

简书HTML解析成Markdown(Python)

作者: 素笺淡墨染流年 | 来源:发表于2018-12-03 11:43 被阅读62次

    想着把我的简书blog同步到自己的blog,这个工具应运而生了。(/滑稽)

    做的时候大概想了2个方案:

    • 登录简书下载md
    • 解析简书html为md

    第一个方案,登录时需要通过图片点击验证,只好放弃,所以就采用第二个方案。
    自己的blog后端是用Python写的,语言也就python了。
    github搜索了一下,库还蛮多的,我选择了其中的Tomd
    当然里面的规则并不能完全解析简书的html,比如图片规则,所以我对里面做了一些修改,改的不多。

    # tomd.py
    # coding: utf-8
    
    import re
    import os
    import warnings
    
    __all__ = ['Tomd', 'convert']
    
    # Markdown wrappers, keyed by tag name: (prefix, suffix) placed around the
    # converted content of an element with that tag.
    MARKDOWN = {
        'h1': ('\n# ', '\n'),
        'h2': ('\n## ', '\n'),
        'h3': ('\n### ', '\n'),
        'h4': ('\n#### ', '\n'),
        'h5': ('\n##### ', '\n'),
        'h6': ('\n###### ', '\n'),
        'code': ('`', '`'),
        'ul': ('', ''),
        'ol': ('', ''),
        'li': ('- ', ''),
        'blockquote': ('\n> ', '\n'),
        'em': ('*', '*'),
        'strong': ('**', '**'),
        'block_code': ('\n```\n', '\n```\n'),
        'span': ('', ''),
        'p': ('\n', '\n'),
        'p_with_out_class': ('\n', '\n'),
        'inline_p': ('', ''),
        'inline_p_with_out_class': ('', ''),
        'b': ('**', '**'),
        'i': ('*', '*'),
        'del': ('~~', '~~'),
        'hr': ('\n---', '\n\n'),
        'thead': ('\n', '|------\n'),
        'tbody': ('\n', '\n'),
        'td': ('|', ''),
        'th': ('|', ''),
        'tr': ('', '\n'),
        'table': ('', '\n'),
        'e_p': ('', '\n'),
        'jianshu_img': ('', ''),
    }

    # Block-level patterns, keyed by tag name. Tomd.convert scans the whole
    # document with each of them using re.I | re.S | re.M.
    # Patterns containing regex escapes are raw strings so that `\s` / `\S`
    # are not treated as (invalid) Python string escapes.
    BlOCK_ELEMENTS = {
        'h1': '<h1.*?>(.*?)</h1>',
        'h2': '<h2.*?>(.*?)</h2>',
        'h3': '<h3.*?>(.*?)</h3>',
        'h4': '<h4.*?>(.*?)</h4>',
        'h5': '<h5.*?>(.*?)</h5>',
        'h6': '<h6.*?>(.*?)</h6>',
        'hr': '<hr/>',
        'blockquote': '<blockquote.*?>.*?>(.*?)</.*?</blockquote>',
        'ul': '<ul.*?>(.*?)</ul>',
        'ol': '<ol.*?>(.*?)</ol>',
        'block_code': '<pre.*?><code.*?>(.*?)</code></pre>',
        'p': r'<p\s.*?>(.*?)</p>',
        'p_with_out_class': '<p>(.*?)</p>',
        'thead': '<thead.*?>(.*?)</thead>',
        'tr': '<tr.*?>(.*?)</tr>',
        # Jianshu image container: captures everything up to the caption div.
        'jianshu_img': r'<div.+?image-package.*?>(.+?image-caption.+?</div>)[\s\S]*?</div>',
    }

    # Inline patterns, keyed by tag name. Element.parse_inline applies them
    # with re.sub and no flags, so '.' does not match a newline there; the
    # '(.|\n)' groups below contain a literal newline character to span lines.
    INLINE_ELEMENTS = {
        'td': '<td.*?>((.|\n)*?)</td>',  # td element may span lines
        'tr': '<tr.*?>((.|\n)*?)</tr>',
        'th': '<th.*?>(.*?)</th>',
        'b': '<b.*?>(.*?)</b>',
        'i': '<i.*?>(.*?)</i>',
        'del': '<del.*?>(.*?)</del>',
        'inline_p': r'<p\s.*?>(.*?)</p>',
        'inline_p_with_out_class': '<p>(.*?)</p>',
        'code': '<code.*?>(.*?)</code>',
        'span': '<span.*?>(.*?)</span>',
        'ul': '<ul.*?>(.*?)</ul>',
        'ol': '<ol.*?>(.*?)</ol>',
        'li': '<li.*?>(.*?)</li>',
        'img': '<img.*?src="(.*?)".*?>(.*?)</img>',
        'img_single': '<img.*?src="(.*?)".*?/>',
        'img_single_no_close': '<img.*?src="(.*?)".*?>',
        'a': '<a.*?href="(.*?)".*?>(.*?)</a>',
        'em': '<em.*?>(.*?)</em>',
        'strong': r'<strong.*?>(\s*)(.*?)(\s*)</strong>',
        'tbody': '<tbody.*?>((.|\n)*)</tbody>',
        # Group 1: image source, group 2: caption text.
        'jianshu_img': r'data-original-src="(.+?)"[\s\S]*image-caption">(.+?)</div>',
    }

    # Regexes for leftover markup that Tomd.convert strips from the final Markdown.
    DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>', '<br clear="none"/>', '<center.*?>', '</center>']
    
    
    class Element:
        """One matched HTML fragment together with its Markdown rendering.

        Attributes:
            start_pos, end_pos: span of the match in the source HTML.
            content: inner HTML of the match; rewritten in place to Markdown
                by parse_inline() when ``is_block`` is true.
            tag: key into MARKDOWN selecting the prefix/suffix used by str().
            folder: base directory used to locate Evernote todo images on disk.
            is_block: when true, inline parsing runs during construction.
        """

        def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.content = content
            self._elements = []
            self.is_block = is_block
            self.tag = tag
            self.folder = folder
            self._result = None

            if self.is_block:
                self.parse_inline()

        def __str__(self):
            """Return the content wrapped in this tag's MARKDOWN prefix/suffix."""
            wrapper = MARKDOWN.get(self.tag)
            self._result = '{}{}{}'.format(wrapper[0], self.content, wrapper[1])
            return self._result

        def parse_inline(self):
            """Rewrite ``self.content`` from inline HTML to Markdown, in place."""
            if self.tag == 'jianshu_img':
                m = re.search(INLINE_ELEMENTS.get(self.tag), self.content)
                if m:
                    self.content = '<img src="https:{}">{}</img>'.format(m.group(1), m.group(2))
                else:
                    # The caption group needs at least one character, so an
                    # image with an empty caption does not match above. Fall
                    # back to the source alone instead of crashing on None.
                    src = re.search(r'data-original-src="(.+?)"', self.content)
                    if src:
                        self.content = '<img src="https:{}"></img>'.format(src.group(1))

            self.content = self.content.replace('\r', '')  # windows \r character
            # '\xc2\xa0' is the UTF-8 byte pair of a no-break space read as two
            # characters; on Python 3 text the real character is '\xa0'.
            self.content = self.content.replace('\xc2\xa0', ' ')
            self.content = self.content.replace('\xa0', ' ')  # no break space
            self.content = self.content.replace('&quot;', '\"')  # html quote mark

            # re.finditer iterates over the string as it was before the loop;
            # the replacements below rebind self.content without affecting it.
            for m in re.finditer("<img(.*?)en_todo.*?>", self.content):
                # remove img and change to [ ] and [x]
                # evernote specific parsing
                imgSrc = re.search('src=".*?"', m.group())
                imgLoc = imgSrc.group()[5:-1]  # remove source and " "
                imgLoc = imgLoc.replace('\\', '/')  # \\ folder slash rotate
                # The checkbox state is decided by the image file size on disk.
                if os.stat(self.folder + "/" + imgLoc).st_size < 250:
                    self.content = self.content.replace(m.group(), "[ ] ")
                else:
                    self.content = self.content.replace(m.group(), "[x] ")

            if "e_" in self.tag:  # evernote-specific parsing
                # BlOCK_ELEMENTS may not define a 'table' pattern; skip the
                # table shortcut in that case rather than raising KeyError.
                table_pattern = BlOCK_ELEMENTS.get('table')
                if table_pattern is not None:
                    m = re.search(table_pattern, self.content, re.I | re.S | re.M)
                    if m:
                        # Only the first table is used; its parsed content
                        # replaces this element's content entirely.
                        inner = Element(start_pos=m.start(),
                                        end_pos=m.end(),
                                        content=''.join(m.groups()),
                                        tag='table', folder=self.folder,
                                        is_block=True)
                        self.content = inner.content
                        return  # no need for further parsing ?

                # if no table, parse as usual
                self.content = self.content.replace('<hr/>', '\n---\n')
                self.content = self.content.replace('<br/>', '')

            if self.tag == "table":  # for removing tbody
                self.content = re.sub(INLINE_ELEMENTS['tbody'], r'\g<1>', self.content)

            # Patterns are applied in alphabetical order of their tag names.
            for tag in sorted(INLINE_ELEMENTS):
                pattern = INLINE_ELEMENTS[tag]

                if tag == 'a':
                    self.content = re.sub(pattern, r'[\g<2>](\g<1>)', self.content)
                elif tag == 'img':
                    self.content = re.sub(pattern, r'![\g<2>](\g<1>)', self.content)
                elif tag == 'img_single':
                    self.content = re.sub(pattern, r'![](\g<1>)', self.content)
                elif tag == 'img_single_no_close':
                    self.content = re.sub(pattern, r'![](\g<1>)', self.content)
                elif self.tag == 'ul' and tag == 'li':
                    self.content = re.sub(pattern, r'- \g<1>\n', self.content)
                elif self.tag == 'ol' and tag == 'li':
                    self.content = re.sub(pattern, r'1. \g<1>\n', self.content)
                elif self.tag == 'thead' and tag == 'tr':
                    self.content = re.sub(pattern, r'\g<1>\n', self.content.replace('\n', ''))
                elif self.tag == 'tr' and tag == 'th':
                    self.content = re.sub(pattern, r'|\g<1>', self.content.replace('\n', ''))
                elif self.tag == 'tr' and tag == 'td':
                    self.content = re.sub(pattern, r'|\g<1>|', self.content.replace('\n', ''))
                    self.content = self.content.replace("||", "|")  # end of column also needs a pipe
                elif self.tag == 'table' and tag == 'td':
                    self.content = re.sub(pattern, r'|\g<1>|', self.content)
                    self.content = self.content.replace("||", "|")  # end of column also needs a pipe
                    self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
                    self.construct_table()
                else:
                    wrapper = MARKDOWN.get(tag)
                    # 'strong' captures surrounding whitespace in groups 1 and 3;
                    # only the text in group 2 is kept.
                    group = 2 if tag == "strong" else 1
                    template = r'{}\g<{}>{}'.format(wrapper[0], group, wrapper[1])
                    self.content = re.sub(pattern, template, self.content)

            if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2:
                # focusing on div, add new line if not there (and if content is long enough)
                self.content += '\n'

        def construct_table(self):
            """Prepend Markdown separator rows to pipe-delimited table content.

            The column count is taken from the first non-empty piece of
            ``self.content.split('\\n', 3)``. If every piece is empty the
            content is left unchanged.
            """
            count = None
            for row in self.content.split('\n', 3):
                if row != "":
                    count = row.count("|")  # count number of pipes
                    break
            if count is None:
                # Nothing that looks like a row: no separator can be derived.
                return

            pipe = "\n|" + "---|" * (count - 1) + "\n"  # beginning \n for safety
            self.content = pipe + pipe + self.content + "\n"  # TODO: column titles?
            self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
            self.content = self.content.replace("<br/>\n", "<br/>")  # keep a cell's line break on one row
    
    
    class Tomd:
        """Convert an HTML string to Markdown and optionally write it to a file.

        Args:
            html: HTML source to convert.
            folder: default output folder for export(); also handed to every
                Element created during conversion.
            file: source file name; export() replaces '.html' with '.md'.
            options: stored but not used by the conversion.
        """

        def __init__(self, html='', folder='', file='', options=None):
            self.html = html  # actual data
            self.folder = folder
            self.file = file
            self.options = options  # haven't been implemented yet
            self._markdown = self.convert(self.html, self.options)

        def convert(self, html="", options=None):
            """Convert ``html`` (or ``self.html`` when empty) to Markdown.

            The result is stored in ``self._markdown`` and returned.
            """
            if html == "":
                html = self.html
            # main function here
            elements = []
            for tag, pattern in BlOCK_ELEMENTS.items():
                for m in re.finditer(pattern, html, re.I | re.S | re.M):
                    start, end = m.start(), m.end()
                    # A match strictly inside an already kept element is dropped.
                    enclosed = any(e.start_pos < start and e.end_pos > end
                                   for e in elements)
                    # Kept elements strictly inside this match are discarded.
                    # A new list is built because removing from the list being
                    # iterated would skip the entry after each removal.
                    elements = [e for e in elements
                                if not (e.start_pos > start and e.end_pos < end)]
                    if enclosed:
                        continue
                    # now m contains the pattern without the tag
                    elements.append(Element(start_pos=start,
                                            end_pos=end,
                                            content=''.join(m.groups()),
                                            tag=tag,
                                            folder=self.folder,
                                            is_block=True))
            elements.sort(key=lambda element: element.start_pos)
            self._markdown = ''.join(str(e) for e in elements)

            # Strip leftover markup that no rule converted.
            for leftover in DELETE_ELEMENTS:
                self._markdown = re.sub(leftover, '', self._markdown)
            return self._markdown

        @property
        def markdown(self):
            """Re-run the conversion on ``self.html`` and return the Markdown."""
            self.convert(self.html, self.options)
            return self._markdown

        def export(self, folder=False):
            """Write the converted Markdown to a UTF-8 file.

            Args:
                folder: output folder. When given it takes precedence over
                    ``self.folder``; when both are missing the file is written
                    to the current working directory and a warning is issued.
            """
            if len(self.file) < 1:
                warnings.warn("file not specified, renamed to tmp.md")
                file = "tmp.md"
            else:
                file = self.file.replace('.html', '.md')  # rename to md

            if folder:  # an explicit folder always wins
                file = os.path.join(folder, file)
            elif len(self.folder) < 2:
                warnings.warn("folder not specified, will save to pwd")
            else:
                file = os.path.join(self.folder, file)

            # The context manager closes the file even if the write fails.
            with open(file, 'w', encoding='utf8') as f:
                f.write(self._markdown)
    
    
    # Module-level convenience: one shared Tomd instance built with empty html,
    # whose bound convert() method is exposed as the function `convert`.
    _inst = Tomd()
    convert = _inst.convert
    
    

    然后上一个测试:

    import re
    
    import requests
    from bs4 import BeautifulSoup
    
    import tomd
    
    
    def main():
        """Download one Jianshu article and save its body as Markdown.

        The article body is the inner HTML of ``div.show-content-free``; it is
        printed and then exported through tomd with the file name 'test.md'.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        url = 'https://www.jianshu.com/p/c75f1ce0a6ae'
        headers = {
            'referer': url,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
        }

        # A timeout keeps the script from hanging forever on a stalled connection.
        res = requests.get(url=url, headers=headers, timeout=10)
        res.raise_for_status()

        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser is installed and emits a warning.
        soup = BeautifulSoup(res.text, 'html.parser')

        content = soup.select_one('div.show-content-free')

        # Strip the wrapping div so only its inner HTML is converted.
        m = re.search(r'div\sclass="show-content-free">([\s\S]*)</div>', str(content))
        if m is None:
            print('article body (div.show-content-free) not found')
            return

        html = m.group(1)
        print(html)
        tomd.Tomd(html=html, file='test.md').export()
    
    
    # Run the demo only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()
    

    看看结果:

    gif.gif

    Github地址

    相关文章

      网友评论

          本文标题:简书HTML解析成Markdown(Python)

          本文链接:https://www.haomeiwen.com/subject/plfycqtx.html