美文网首页
html2markdown

html2markdown

作者: 夜空最亮的9星 | 来源:发表于2021-12-29 13:46 被阅读0次
    import os
    
    from markdownify import markdownify as md
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    
    
    def parse_url(title, link_url):
        """Fetch a web page with Chrome and save its article body as Markdown.

        Opens ``link_url`` in a Selenium-driven Chrome, waits up to 10 seconds
        for the element with class ``post-content`` to appear, converts that
        element's outer HTML to Markdown with ``markdownify`` and writes the
        result to ``doc/<title>.md``. The file starts with a ``# <title>``
        heading followed by a "来源" section that holds the URL.

        Args:
            title: Heading of the generated document; also used verbatim as
                the output file name (without the ``.md`` suffix).
            link_url: Address of the page to convert.

        Raises:
            selenium.common.exceptions.TimeoutException: If no
                ``post-content`` element is found within 10 seconds.
        """
        chromedriver_path = "/Users/user/chrome/chromedriver"
        options = webdriver.ChromeOptions()
        # Do not load images, which speeds up page access.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # Important: exclude the "enable-automation" switch so that websites
        # are less likely to detect that Selenium is driving the browser.
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        browser = webdriver.Chrome(executable_path=chromedriver_path, options=options)
        try:
            browser.get(link_url)
            # Constructing WebDriverWait alone does not wait; until() polls
            # until the locator succeeds or the 10 second timeout expires.
            article = WebDriverWait(browser, 10).until(
                lambda driver: driver.find_element(By.CLASS_NAME, "post-content")
            )
            content = md(article.get_attribute("outerHTML"))
        finally:
            # Always end the driver session, even when loading or parsing
            # failed, so no Chrome/chromedriver process is left behind.
            browser.quit()

        # Create the output directory on first use.
        os.makedirs('doc', exist_ok=True)
        # Explicit UTF-8: the document contains non-ASCII text regardless of
        # the platform's default locale encoding.
        with open('doc/{}.md'.format(title), 'w', encoding='utf-8') as f:
            f.write("# " + title)
            f.write("\n")
            f.write("## 来源")
            f.write("\n")
            f.write(link_url)
            f.write("\n")
            f.write(content)
            f.write("\n\n")
    

    相关文章

      网友评论

          本文标题:html2markdown

          本文链接:https://www.haomeiwen.com/subject/iopvqrtx.html