Foreword
The articles I write on Jianshu are in Markdown, so copying them out directly produces a mess. How can I fetch the articles with a crawler instead?
Analysis
1. Article URLs to fetch:
- Site: https://www.jianshu.com
- User: /u/39cef8a56bf9
- Notebooks (collections): get my notebooks and store them in a list or dict, e.g. /nb/23293583
- Article URLs: get every article URL under one notebook, e.g. /p/3cb9bf01825c
Content to extract from each article: title, author, word count and the body text (the body can be given different fonts or formatting depending on its tags)
2. Libraries used:
- requests: send GET requests and fetch page content
- BeautifulSoup: parse the HTML
- pdfkit: convert HTML to PDF
- pdfkit is only a wrapper around the wkhtmltopdf tool, so wkhtmltopdf must first be installed on the local machine and its bin directory added to the PATH environment variable (see the configuration sketch at the end of this section)
- wkhtmltopdf download: https://wkhtmltopdf.org/downloads.html
3. Desired PDF output:
- Name each PDF file after the Jianshu notebook (collection)
- Use each article's title as that article's heading inside the PDF
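Since pdfkit only drives wkhtmltopdf, it can also be pointed at the binary explicitly instead of relying on PATH. A minimal sketch, assuming a default Windows install location (the path below is an assumption, adjust it to your machine):

```python
import pdfkit

# Assumed install path; change it to wherever wkhtmltopdf actually lives
config = pdfkit.configuration(wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')
pdfkit.from_url('https://www.jianshu.com/p/3cb9bf01825c', 'test.pdf', configuration=config)
```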
Technical preparation
1. Sending requests with requests
- Add header information, otherwise the server responds with 403. Putting browser information in the headers is enough; if you still get a 403, your User-Agent is not being accepted, so switch to another one (a sketch follows).
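A minimal sketch of such a request (the User-Agent string below is only an example; any current browser string should work):

```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
resp = requests.get('https://www.jianshu.com/u/39cef8a56bf9', headers=headers)
print(resp.status_code)  # 200 if the header is accepted, 403 otherwise
html = resp.text
```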
2. Parsing HTML with BeautifulSoup
- Locating element information with BeautifulSoup (example below)
Documentation (Chinese): https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
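For instance, locating the notebook links uses the same kind of selector as the scripts further down; a minimal, self-contained sketch with a made-up HTML snippet:

```python
import re
from bs4 import BeautifulSoup

html = '<ul><li><a class="name" href="/nb/23293583">python notes</a></li></ul>'  # sample snippet
bsobj = BeautifulSoup(html, 'html.parser')
# find_all can filter on tag name, attributes and an href regex at the same time
for a in bsobj.find_all('a', attrs={'class': 'name'}, href=re.compile('^/nb/')):
    print(a.text.strip(), a['href'])  # -> python notes /nb/23293583
```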
3. Converting HTML to PDF with pdfkit (example below)
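pdfkit exposes three entry points, from_url, from_file and from_string, all of which call wkhtmltopdf under the hood. A minimal sketch (the file name article0.html is only an assumption, matching the files saved by the improved script below):

```python
import pdfkit

pdfkit.from_url('https://www.jianshu.com/p/3cb9bf01825c', 'from_url.pdf')  # a live page
pdfkit.from_file('article0.html', 'from_file.pdf')                         # a saved HTML file
pdfkit.from_string('<h1>hello</h1>', 'from_string.pdf')                    # an HTML string
```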
Implementation steps
1. Load the user's home page with PhantomJS (Selenium) and grab the rendered HTML
2. Parse the HTML with BeautifulSoup and collect the notebook (collection) links
3. For each notebook, collect the URLs of all its articles
4. Convert the articles of each notebook into one PDF with pdfkit
Improvement:
1. Process the article page's HTML: delete every node that follows the article element, then save the HTML (a sketch of this idea follows).
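A minimal sketch of that idea with BeautifulSoup (the final script below takes a slightly different route and copies the article node into a clean HTML template instead):

```python
from bs4 import BeautifulSoup

html = '<body><div class="article">body text</div><div class="recommend">noise</div></body>'  # sample snippet
bsobj = BeautifulSoup(html, 'html.parser')
article = bsobj.find(class_='article')
# drop every sibling node that comes after the article element
for node in article.find_next_siblings():
    node.decompose()
print(bsobj)  # only the article div is left inside body
```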
First version
What it achieves: one PDF file is generated per notebook, but each article also pulls in page elements such as the recommended-articles list, so the content is not focused.
What to improve: process the HTML first and convert with from_file instead. Save the processed HTML of each URL to a file, then pass the list of file names to from_file to produce the PDF.
import re
import pdfkit
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.PhantomJS()
base_url = 'https://www.jianshu.com'
user = '/u/39cef8a56bf9'


# Fetch the rendered HTML of a page with the headless browser
def get_html(driver, url):
    driver.get(url)
    html = driver.page_source
    return html


# Wrap the HTML in a BeautifulSoup object
def html_to_bsobj(html):
    bsobj = BeautifulSoup(html, 'html.parser')
    return bsobj


# Parse the home page and collect the notebook (collection) links
def get_essays(bsobj):
    essays = bsobj.find_all('a', attrs={'class': 'name'}, href=re.compile("^/nb/"))
    essay_dict = {}
    for essay in essays:
        essay_link = essay['href']
        essay_name = essay.text
        essay_dict[essay_name.strip()] = essay_link
    return essay_dict


# Collect the article links inside one notebook
def get_articles(bsobj):
    articles = bsobj.find_all('a', attrs={'class': 'title'}, href=re.compile("^/p/"))
    article_urls = []
    for article in articles:
        article_link = article['href']
        article_urls.append(base_url + article_link)
    return article_urls


# Convert a list of article URLs into a single PDF
def save_pdf(article_urls, filename):
    # options = {
    #     'page-size': 'Letter',
    #     'encoding': "UTF-8",
    #     'custom-header': [
    #         ('Accept-Encoding', 'gzip')
    #     ]
    # }
    pdfkit.from_url(article_urls, output_path=filename + '.pdf')


if __name__ == '__main__':
    index_page = get_html(driver, base_url + user)
    index_bsobj = html_to_bsobj(index_page)
    for essay_name, essay_link in get_essays(index_bsobj).items():
        print(essay_link)
        essay_page = get_html(driver, base_url + essay_link)
        essay_bsobj = html_to_bsobj(essay_page)
        article_urls = get_articles(essay_bsobj)
        save_pdf(article_urls, essay_name)
Improved version: good enough for now
"""
Created by catleer on 2018-06-06.
"""
__author__ = 'catleer'
import codecs
import re, time
import pdfkit
from selenium import webdriver
from bs4 import BeautifulSoup
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""
driver = webdriver.PhantomJS()
base_url = 'https://www.jianshu.com'
user = '/u/39cef8a56bf9'
# 将html实例化为Beautiful Soup对象
def html_to_bsobj(driver, url):
driver.get(url)
time.sleep(10)
html = driver.page_source
bsobj = BeautifulSoup(html)
bsobj.encode('utf-8')
return bsobj
# 解析首页html,获取文集的链接信息
def get_essays(bsobj):
essays = bsobj.find_all('a', attrs={'class': 'name'}, href=re.compile("^/nb/"))
# print(essays)
essay_dict = {}
for essay in essays:
essay_link = essay['href']
essay_name = essay.text
essay_dict[essay_name.strip()] = essay_link
return essay_dict
# 获得文章的链接信息
def get_articles(bsobj):
articles = bsobj.find_all('a', attrs={'class': 'title'}, href=re.compile("^/p/"))
article_urls = []
for article in articles:
article_link = article['href']
article_urls.append(base_url+article_link)
print("排序前", article_urls)
article_urls.reverse()
return article_urls
# 解析文章的html页面,将文章正文存入html中,并对图片进行处理
def parse_article_html(article_page):
body = article_page.find_all(class_='article')[0]
body = str(body)
# 图片将相对路径处理为绝对路径
pattern = "(<img .*?data-original-src=\")(.*?)(\")"
def func(m):
if not m.group(2).startswith("http"):
rtn = "".join([m.group(1), 'https:', m.group(2), m.group(3)])
return rtn
else:
return "".join([m.group(1), m.group(2), m.group(3)])
html = re.compile(pattern).sub(func, str(body))
html = html_template.format(content=html)
return html
def save_pdf():
options = {
'page-size': 'Letter',
'margin-top': '0.75in',
'margin-right': '0.75in',
'margin-bottom': '0.75in',
'margin-left': '0.75in',
'encoding': "UTF-8",
'custom-header': [
('Accept-Encoding', 'gzip')
],
'cookie': [
('cookie-name1', 'cookie-value1'),
('cookie-name2', 'cookie-value2'),
],
'outline-depth': 10,
}
index_bsobj = html_to_bsobj(driver, base_url + user)
essays = get_essays(index_bsobj)
print(essays)
for essay_name, essay_link in essays.items():
essay_bsobj = html_to_bsobj(driver, base_url + essay_link)
article_urls = get_articles(essay_bsobj)
# print(article_urls)
htmls = []
for index, article_url in enumerate(article_urls):
article_page = html_to_bsobj(driver, article_url)
time.sleep(10)
# print(article_page)
html = parse_article_html(article_page)
f_name = '.'.join([essay_name + str(index), 'html'])
with codecs.open(f_name, 'w+', 'utf-8') as f:
f.write(html)
htmls.append(f_name)
break
pdfkit.from_file(htmls, essay_name+'.pdf', options=options)
if __name__ == '__main__':
save_pdf()