# 爬取古诗文网
# 使用re、正则表达式findall函数
import re
import requests
def data_capture(url):
    """Fetch one listing page of gushiwen.org and parse the poems on it.

    Parameters
    ----------
    url : str
        URL of a listing page, e.g. ``https://www.gushiwen.org/default_1.aspx``.

    Returns
    -------
    list[dict]
        One dict per poem with keys '题目' (title), '朝代' (dynasty),
        '诗人' (poet) and '内容' (content).
    """
    headers = {
        # BUG FIX: key was misspelled 'Use-Agent', so the UA was never sent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'referer': 'https://www.gushiwen.org/default_1.aspx',
    }
    response = requests.get(url, headers=headers)
    text = response.text

    # re.DOTALL lets '.' match newlines, since each poem spans several lines.
    titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Inside <p class="source"> the first <a> holds the dynasty ...
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # ... and the second <a> holds the poet's name.
    poets = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    raw_contents = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)

    contents = []
    for raw in raw_contents:
        # Strip the simple markup (<br/>, <p>, </p>) left inside the body.
        # Raw string + \s* also tolerates '<br/>' and '<br  />' variants.
        cleaned = re.sub(r'<br\s*/>|<p>|</p>', '', raw)
        contents.append(cleaned.strip())

    poems = []
    # Distinct loop names: the original rebound title/dynasty/poet/Contents
    # here, clobbering the source lists on the first iteration.
    for t, d, p, c in zip(titles, dynasties, poets, contents):
        poems.append({
            '题目': t,
            '朝代': d,
            '诗人': p,
            '内容': c,
        })
    # BUG FIX: the original built `poems` but never returned it.
    return poems
def spider():
    """Crawl listing pages 1-10 of gushiwen.org and collect all poems.

    Returns
    -------
    list[dict]
        Poems gathered from every page, in page order (may be empty if
        ``data_capture`` yields nothing).
    """
    datas = []
    url_template = 'https://www.gushiwen.org/default_{}.aspx'
    for page in range(1, 11):
        # BUG FIX: a stray `break` ended the loop after page 1, and the
        # parsed results were silently discarded instead of collected.
        page_poems = data_capture(url_template.format(page))
        if page_poems:  # tolerate data_capture returning None/empty
            datas.extend(page_poems)
    return datas


# Guard the entry call so importing this module doesn't start crawling.
if __name__ == '__main__':
    spider()
'''
ps:总结,这次爬虫使用的是re,requests,findall,其中findall寻找标签下的内容通过()去完成,贪婪模式和非贪婪模式是重点
re.DOTALL的用法将 . 匹配到换行符
'''
# 网友评论  (stray page text from the copied article — commented out: a bare identifier here raises NameError on import)