from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from markdownify import markdownify as md
def parse_url(title: str, link_url: str) -> None:
    """Save the ``post-content`` element of a web page as a Markdown file.

    Opens ``link_url`` in Chrome, waits for the element with class
    ``post-content`` to appear, converts its outer HTML to Markdown and
    writes the result to ``doc/<title>.md`` together with a title heading
    and the source URL.

    Args:
        title: Used both as the level-1 heading and as the output file name
            (``doc/<title>.md``). It is inserted into the path as-is.
        link_url: Address of the page to fetch.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``post-content``
            element is found within 10 seconds.
    """
    import os  # local import: only needed to make sure the output dir exists

    chromedriver_path = "/Users/user/chrome/chromedriver"
    options = webdriver.ChromeOptions()
    # Do not load images; this speeds up page access.
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    # Original author's note: this step is important -- excluding the
    # 'enable-automation' switch is meant to keep sites from detecting
    # that Selenium is driving the browser.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    browser = webdriver.Chrome(executable_path=chromedriver_path, options=options)
    try:
        browser.get(link_url)
        # A bare WebDriverWait(...) does nothing by itself; .until() is what
        # actually polls. It retries while the lookup raises
        # NoSuchElementException and returns the element once it is found.
        post = WebDriverWait(browser, 10).until(
            lambda drv: drv.find_element_by_class_name("post-content")
        )
        html = post.get_attribute("outerHTML")
    finally:
        # quit() (rather than close()) ends the whole driver session, and the
        # finally clause guarantees that happens even when a step above fails.
        browser.quit()

    content = md(html)

    os.makedirs('doc', exist_ok=True)
    # Explicit UTF-8: the heading below and the page content are non-ASCII,
    # so the platform default encoding must not be relied upon.
    with open('doc/{}.md'.format(title), 'w', encoding='utf-8') as f:
        f.write(
            "# " + title + "\n"
            + "## 来源" + "\n"
            + link_url + "\n"
            + content + "\n\n"
        )
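# "Reader comments" -- apparently a leftover heading from the web page this snippet was copied from; not part of the script.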