# 最后输出PDF格式,暂不稳定,有待改进。
import requests
from lxml import etree
from bs4 import BeautifulSoup
import pdfkit
# Account id of the public account to scrape; it is spliced into the
# chuansong.me "/account/<id>" listing URL.
KEYWORD = 'duhaoshu'

# Request headers passed to every requests.get() call in this script
# (a desktop Chrome User-Agent string).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
def judge(nameid):
    """Check whether chuansong.me has indexed the given public account.

    Fetches the first listing page of the account and looks at the
    ``<h1>`` inside the ``topic_page`` container. If the server answers
    with HTTP 404, or that heading mentions ``404``, a notice is printed
    and the account is reported as not indexed.

    Args:
        nameid: Account id as it appears in ``/account/<id>`` URLs.

    Returns:
        ``True`` when no 404 indication was found, ``False`` otherwise.
    """
    url = 'http://chuansong.me/account/' + nameid + '?start=' + str(0)
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    page = etree.HTML(response.text)
    headings = page.xpath('//div[contains(@class,"topic_page")]/h1/text()')
    # xpath() returns a list of text nodes, so ``'404' in headings`` would
    # only match a node that is exactly "404"; search inside each node.
    not_found = response.status_code == 404 or any(
        '404' in text for text in headings
    )
    if not_found:
        print('暂未收录该公众号的文章。')
        return False
    return True
def parse_one_page(url):
    """Fetch one article-listing page and convert its first article to PDF.

    Every ``<div tabindex="-1">`` on the page is treated as an article
    entry. The first entry that carries a link with an ``href`` is passed
    to ``parse_detail``; the remaining entries are not processed.

    Args:
        url: URL of an account listing page on chuansong.me.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(r.text, 'lxml')
    for entry in soup.find_all('div', {'tabindex': '-1'}):
        link = entry.find('a')
        detail_url = link.get('href') if link is not None else None
        if not detail_url:
            # Entry without an anchor or href: indexing [0] or concatenating
            # None would raise, so move on to the next entry instead.
            continue
        parse_detail('http://chuansong.me' + detail_url)
        # Stop after the first article, as the original loop did.
        break
def parse_detail(full_url, path_wk=None):
    """Download an article page and save it as ``<title>.pdf``.

    The title is read from ``<h2 class="rich_media_title">`` and the
    publish time from ``<em id="publish_time">``. The page URL is then
    rendered to a PDF in the current working directory via pdfkit.

    Args:
        full_url: Absolute URL of the article page.
        path_wk: Optional path to the wkhtmltopdf executable. When omitted,
            the script's original hard-coded Windows install path is used.
    """
    if path_wk is None:
        # Location where wkhtmltopdf is installed.
        path_wk = r'F:\Downloads\html-pdf\wkhtmltopdf\bin\wkhtmltopdf.exe'

    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(full_url, headers=headers, timeout=10)
    data = etree.HTML(r.text)

    titles = data.xpath('//h2[@class="rich_media_title"]/text()')
    # The text node usually carries surrounding whitespace/newlines.
    title = titles[0].strip() if titles else ''
    if not title:
        # Without a title there is no sensible output filename.
        print('未找到文章标题,已跳过:' + full_url)
        return

    times = data.xpath('//em[@id="publish_time"]/text()')
    publish_time = times[0].strip() if times else ''

    # Replace characters that are illegal in Windows filenames (and control
    # whitespace) so the title can be used safely as the output name.
    unsafe = {ord(ch): '_' for ch in '\\/:*?"<>|\r\n\t'}
    filename = title.translate(unsafe) + '.pdf'

    # Convert the URL to a PDF.
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    print(title, publish_time)
    pdfkit.from_url(full_url, filename, configuration=config)
    print('转换成功!')
def main():
    """Walk the listing pages of the KEYWORD account and process each one.

    Currently only a single listing page (start offset 0) is visited.
    """
    account_base = 'http://chuansong.me/account/' + KEYWORD
    for start in range(1):
        parse_one_page(account_base + '?start={}'.format(start))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# 网友评论